xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision dc5cefde2fdcdb92db1db0a399b233ecd778849b)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt       nz,idx,idt,j,i,oidx;
125   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126   MatScalar      *aa=a->a,*v;
127   PetscScalar    s1,s2,x1,x2;
128   PetscScalar    *x,*b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,x1,x2,x3;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode ierr;
244   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt       nz,idx,idt,j,i,oidx;
246   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
247   MatScalar      *aa=a->a,*v;
248   PetscScalar    s1,s2,s3,x1,x2,x3;
249   PetscScalar    *x,*b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode ierr;
306   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   PetscInt       *diag = a->diag,oidx;
308   MatScalar      *aa=a->a,*v;
309   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
310   PetscScalar    *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode ierr;
371   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt       nz,idx,idt,j,i,oidx;
373   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
374   MatScalar      *aa=a->a,*v;
375   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
376   PetscScalar    *x,*b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode ierr;
436   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437   PetscInt       *diag = a->diag,oidx;
438   MatScalar      *aa=a->a,*v;
439   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
440   PetscScalar    *x,*b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
509   PetscScalar    *x,*b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode ierr;
573   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574   PetscInt       *diag = a->diag,oidx;
575   MatScalar      *aa=a->a,*v;
576   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
577   PetscScalar    *x,*b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode ierr;
647   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt       nz,idx,idt,j,i,oidx;
649   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
650   MatScalar      *aa=a->a,*v;
651   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
652   PetscScalar    *x,*b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode ierr;
721   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722   PetscInt       *diag = a->diag,oidx;
723   MatScalar      *aa=a->a,*v;
724   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
725   PetscScalar    *x,*b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode ierr;
797   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt       nz,idx,idt,j,i,oidx;
799   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
802   PetscScalar    *x,*b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873   IS             iscol=a->col,isrow=a->row;
874   PetscErrorCode ierr;
875   const PetscInt *r,*c,*rout,*cout;
876   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877   PetscInt       *diag = a->diag;
878   MatScalar      *aa=a->a,*v;
879   PetscScalar    s1,*x,*b,*t;
880 
881   PetscFunctionBegin;
882   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
883   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884   t  = a->solve_work;
885 
886   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888 
889   /* copy the b into temp work space according to permutation */
890   for (i=0; i<n; i++) {
891     t[i] = b[c[i]];
892   }
893 
894   /* forward solve the U^T */
895   for (i=0; i<n; i++) {
896 
897     v     = aa + diag[i];
898     /* multiply by the inverse of the block diagonal */
899     s1    = (*v++)*t[i];
900     vi    = aj + diag[i] + 1;
901     nz    = ai[i+1] - diag[i] - 1;
902     while (nz--) {
903       t[*vi++]  -= (*v++)*s1;
904     }
905     t[i]   = s1;
906   }
907   /* backward solve the L^T */
908   for (i=n-1; i>=0; i--){
909     v    = aa + diag[i] - 1;
910     vi   = aj + diag[i] - 1;
911     nz   = diag[i] - ai[i];
912     s1   = t[i];
913     while (nz--) {
914       t[*vi--]   -=  (*v--)*s1;
915     }
916   }
917 
918   /* copy t into x according to permutation */
919   for (i=0; i<n; i++) {
920     x[r[i]]   = t[i];
921   }
922 
923   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
925   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
926   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 #undef __FUNCT__
932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
934 {
935   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936   IS             iscol=a->col,isrow=a->row;
937   PetscErrorCode ierr;
938   const PetscInt *r,*c,*rout,*cout;
939   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941   MatScalar      *aa=a->a,*v;
942   PetscScalar    s1,s2,x1,x2;
943   PetscScalar    *x,*b,*t;
944 
945   PetscFunctionBegin;
946   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
947   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948   t  = a->solve_work;
949 
950   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952 
953   /* copy the b into temp work space according to permutation */
954   ii = 0;
955   for (i=0; i<n; i++) {
956     ic      = 2*c[i];
957     t[ii]   = b[ic];
958     t[ii+1] = b[ic+1];
959     ii += 2;
960   }
961 
962   /* forward solve the U^T */
963   idx = 0;
964   for (i=0; i<n; i++) {
965 
966     v     = aa + 4*diag[i];
967     /* multiply by the inverse of the block diagonal */
968     x1    = t[idx];   x2 = t[1+idx];
969     s1 = v[0]*x1  +  v[1]*x2;
970     s2 = v[2]*x1  +  v[3]*x2;
971     v += 4;
972 
973     vi    = aj + diag[i] + 1;
974     nz    = ai[i+1] - diag[i] - 1;
975     while (nz--) {
976       oidx = 2*(*vi++);
977       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979       v  += 4;
980     }
981     t[idx]   = s1;t[1+idx] = s2;
982     idx += 2;
983   }
984   /* backward solve the L^T */
985   for (i=n-1; i>=0; i--){
986     v    = aa + 4*diag[i] - 4;
987     vi   = aj + diag[i] - 1;
988     nz   = diag[i] - ai[i];
989     idt  = 2*i;
990     s1 = t[idt];  s2 = t[1+idt];
991     while (nz--) {
992       idx   = 2*(*vi--);
993       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995       v -= 4;
996     }
997   }
998 
999   /* copy t into x according to permutation */
1000   ii = 0;
1001   for (i=0; i<n; i++) {
1002     ir      = 2*r[i];
1003     x[ir]   = t[ii];
1004     x[ir+1] = t[ii+1];
1005     ii += 2;
1006   }
1007 
1008   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 #undef __FUNCT__
1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_newdatastruct"
1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
1019 {
1020   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1021   PetscErrorCode ierr;
1022   IS             iscol=a->col,isrow=a->row;
1023   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1024   const PetscInt *r,*c,*rout,*cout;
1025   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1026   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1027   MatScalar      *aa=a->a,*v;
1028   PetscScalar    s1,s2,x1,x2;
1029   PetscScalar    *x,*b,*t;
1030 
1031   PetscFunctionBegin;
1032   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1034   t = a->solve_work;
1035 
1036   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1037   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1038 
1039   /* copy b into temp work space according to permutation */
1040   for(i=0;i<n;i++){
1041     ii = bs*i; ic = bs*c[i];
1042     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1043   }
1044 
1045   /* forward solve the U^T */
1046   idx = 0;
1047   for (i=0; i<n; i++) {
1048     v     = aa + bs2*diag[i];
1049     /* multiply by the inverse of the block diagonal */
1050     x1 = t[idx];   x2 = t[1+idx];
1051     s1 = v[0]*x1  +  v[1]*x2;
1052     s2 = v[2]*x1  +  v[3]*x2;
1053     v -= bs2;
1054 
1055     vi    = aj + diag[i] - 1;
1056     nz    = diag[i] - diag[i+1] - 1;
1057     for(j=0;j>-nz;j--){
1058       oidx = bs*vi[j];
1059       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1060       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1061       v  -= bs2;
1062     }
1063     t[idx]   = s1;t[1+idx] = s2;
1064     idx += bs;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + bs2*ai[i];
1069     vi   = aj + ai[i];
1070     nz   = ai[i+1] - ai[i];
1071     idt  = bs*i;
1072     s1   = t[idt];  s2 = t[1+idt];
1073     for(j=0;j<nz;j++){
1074       idx   = bs*vi[j];
1075       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1076       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1077       v += bs2;
1078     }
1079   }
1080 
1081   /* copy t into x according to permutation */
1082   for(i=0;i<n;i++){
1083     ii = bs*i;  ir = bs*r[i];
1084     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1097 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1098 {
1099   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1100   IS             iscol=a->col,isrow=a->row;
1101   PetscErrorCode ierr;
1102   const PetscInt *r,*c,*rout,*cout;
1103   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1104   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1105   MatScalar      *aa=a->a,*v;
1106   PetscScalar    s1,s2,s3,x1,x2,x3;
1107   PetscScalar    *x,*b,*t;
1108 
1109   PetscFunctionBegin;
1110   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1112   t  = a->solve_work;
1113 
1114   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1115   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1116 
1117   /* copy the b into temp work space according to permutation */
1118   ii = 0;
1119   for (i=0; i<n; i++) {
1120     ic      = 3*c[i];
1121     t[ii]   = b[ic];
1122     t[ii+1] = b[ic+1];
1123     t[ii+2] = b[ic+2];
1124     ii += 3;
1125   }
1126 
1127   /* forward solve the U^T */
1128   idx = 0;
1129   for (i=0; i<n; i++) {
1130 
1131     v     = aa + 9*diag[i];
1132     /* multiply by the inverse of the block diagonal */
1133     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1137     v += 9;
1138 
1139     vi    = aj + diag[i] + 1;
1140     nz    = ai[i+1] - diag[i] - 1;
1141     while (nz--) {
1142       oidx = 3*(*vi++);
1143       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1144       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1145       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1146       v  += 9;
1147     }
1148     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1149     idx += 3;
1150   }
1151   /* backward solve the L^T */
1152   for (i=n-1; i>=0; i--){
1153     v    = aa + 9*diag[i] - 9;
1154     vi   = aj + diag[i] - 1;
1155     nz   = diag[i] - ai[i];
1156     idt  = 3*i;
1157     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1158     while (nz--) {
1159       idx   = 3*(*vi--);
1160       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1161       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1162       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1163       v -= 9;
1164     }
1165   }
1166 
1167   /* copy t into x according to permutation */
1168   ii = 0;
1169   for (i=0; i<n; i++) {
1170     ir      = 3*r[i];
1171     x[ir]   = t[ii];
1172     x[ir+1] = t[ii+1];
1173     x[ir+2] = t[ii+2];
1174     ii += 3;
1175   }
1176 
1177   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1178   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1181   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_newdatastruct"
1187 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
1188 {
1189   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1190   PetscErrorCode ierr;
1191   IS             iscol=a->col,isrow=a->row;
1192   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1193   const PetscInt *r,*c,*rout,*cout;
1194   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1195   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1196   MatScalar      *aa=a->a,*v;
1197   PetscScalar    s1,s2,s3,x1,x2,x3;
1198   PetscScalar    *x,*b,*t;
1199 
1200   PetscFunctionBegin;
1201   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1203   t = a->solve_work;
1204 
1205   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1206   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1207 
1208   /* copy b into temp work space according to permutation */
1209   for(i=0;i<n;i++){
1210     ii = bs*i; ic = bs*c[i];
1211     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1212   }
1213 
1214   /* forward solve the U^T */
1215   idx = 0;
1216   for (i=0; i<n; i++) {
1217     v     = aa + bs2*diag[i];
1218     /* multiply by the inverse of the block diagonal */
1219     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1220     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1221     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1222     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1223     v -= bs2;
1224 
1225     vi    = aj + diag[i] - 1;
1226     nz    = diag[i] - diag[i+1] - 1;
1227     for(j=0;j>-nz;j--){
1228       oidx = bs*vi[j];
1229       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1230       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1231       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1232       v  -= bs2;
1233     }
1234     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1235     idx += bs;
1236   }
1237   /* backward solve the L^T */
1238   for (i=n-1; i>=0; i--){
1239     v    = aa + bs2*ai[i];
1240     vi   = aj + ai[i];
1241     nz   = ai[i+1] - ai[i];
1242     idt  = bs*i;
1243     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1244     for(j=0;j<nz;j++){
1245       idx   = bs*vi[j];
1246       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1247       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1248       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1249       v += bs2;
1250     }
1251   }
1252 
1253   /* copy t into x according to permutation */
1254   for(i=0;i<n;i++){
1255     ii = bs*i;  ir = bs*r[i];
1256     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1257   }
1258 
1259   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1260   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1261   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1263   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #undef __FUNCT__
1268 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1269 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1270 {
1271   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1272   IS             iscol=a->col,isrow=a->row;
1273   PetscErrorCode ierr;
1274   const PetscInt *r,*c,*rout,*cout;
1275   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1276   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1277   MatScalar      *aa=a->a,*v;
1278   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1279   PetscScalar    *x,*b,*t;
1280 
1281   PetscFunctionBegin;
1282   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1283   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1284   t  = a->solve_work;
1285 
1286   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1287   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1288 
1289   /* copy the b into temp work space according to permutation */
1290   ii = 0;
1291   for (i=0; i<n; i++) {
1292     ic      = 4*c[i];
1293     t[ii]   = b[ic];
1294     t[ii+1] = b[ic+1];
1295     t[ii+2] = b[ic+2];
1296     t[ii+3] = b[ic+3];
1297     ii += 4;
1298   }
1299 
1300   /* forward solve the U^T */
1301   idx = 0;
1302   for (i=0; i<n; i++) {
1303 
1304     v     = aa + 16*diag[i];
1305     /* multiply by the inverse of the block diagonal */
1306     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1307     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1308     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1309     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1310     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1311     v += 16;
1312 
1313     vi    = aj + diag[i] + 1;
1314     nz    = ai[i+1] - diag[i] - 1;
1315     while (nz--) {
1316       oidx = 4*(*vi++);
1317       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1318       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1319       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1320       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1321       v  += 16;
1322     }
1323     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1324     idx += 4;
1325   }
1326   /* backward solve the L^T */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 16*diag[i] - 16;
1329     vi   = aj + diag[i] - 1;
1330     nz   = diag[i] - ai[i];
1331     idt  = 4*i;
1332     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1333     while (nz--) {
1334       idx   = 4*(*vi--);
1335       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1336       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1337       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1338       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1339       v -= 16;
1340     }
1341   }
1342 
1343   /* copy t into x according to permutation */
1344   ii = 0;
1345   for (i=0; i<n; i++) {
1346     ir      = 4*r[i];
1347     x[ir]   = t[ii];
1348     x[ir+1] = t[ii+1];
1349     x[ir+2] = t[ii+2];
1350     x[ir+3] = t[ii+3];
1351     ii += 4;
1352   }
1353 
1354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1356   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_newdatastruct"
1364 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
1365 {
1366   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1367   PetscErrorCode ierr;
1368   IS             iscol=a->col,isrow=a->row;
1369   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1370   const PetscInt *r,*c,*rout,*cout;
1371   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1372   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1373   MatScalar      *aa=a->a,*v;
1374   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1375   PetscScalar    *x,*b,*t;
1376 
1377   PetscFunctionBegin;
1378   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   t = a->solve_work;
1381 
1382   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1383   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1384 
1385   /* copy b into temp work space according to permutation */
1386   for(i=0;i<n;i++){
1387     ii = bs*i; ic = bs*c[i];
1388     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1389   }
1390 
1391   /* forward solve the U^T */
1392   idx = 0;
1393   for (i=0; i<n; i++) {
1394     v     = aa + bs2*diag[i];
1395     /* multiply by the inverse of the block diagonal */
1396     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1397     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1398     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1399     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1400     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1401     v -= bs2;
1402 
1403     vi    = aj + diag[i] - 1;
1404     nz    = diag[i] - diag[i+1] - 1;
1405     for(j=0;j>-nz;j--){
1406       oidx = bs*vi[j];
1407       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1408       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1409       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1410       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1411       v  -= bs2;
1412     }
1413     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1414     idx += bs;
1415   }
1416   /* backward solve the L^T */
1417   for (i=n-1; i>=0; i--){
1418     v    = aa + bs2*ai[i];
1419     vi   = aj + ai[i];
1420     nz   = ai[i+1] - ai[i];
1421     idt  = bs*i;
1422     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1423     for(j=0;j<nz;j++){
1424       idx   = bs*vi[j];
1425       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1426       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1427       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1428       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1429       v += bs2;
1430     }
1431   }
1432 
1433   /* copy t into x according to permutation */
1434   for(i=0;i<n;i++){
1435     ii = bs*i;  ir = bs*r[i];
1436     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1437   }
1438 
1439   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1440   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1443   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1449 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1450 {
1451   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1452   IS             iscol=a->col,isrow=a->row;
1453   PetscErrorCode ierr;
1454   const PetscInt *r,*c,*rout,*cout;
1455   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1456   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1457   MatScalar      *aa=a->a,*v;
1458   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1459   PetscScalar    *x,*b,*t;
1460 
1461   PetscFunctionBegin;
1462   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1464   t  = a->solve_work;
1465 
1466   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1467   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1468 
1469   /* copy the b into temp work space according to permutation */
1470   ii = 0;
1471   for (i=0; i<n; i++) {
1472     ic      = 5*c[i];
1473     t[ii]   = b[ic];
1474     t[ii+1] = b[ic+1];
1475     t[ii+2] = b[ic+2];
1476     t[ii+3] = b[ic+3];
1477     t[ii+4] = b[ic+4];
1478     ii += 5;
1479   }
1480 
1481   /* forward solve the U^T */
1482   idx = 0;
1483   for (i=0; i<n; i++) {
1484 
1485     v     = aa + 25*diag[i];
1486     /* multiply by the inverse of the block diagonal */
1487     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1488     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1489     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1490     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1491     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1492     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1493     v += 25;
1494 
1495     vi    = aj + diag[i] + 1;
1496     nz    = ai[i+1] - diag[i] - 1;
1497     while (nz--) {
1498       oidx = 5*(*vi++);
1499       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1500       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1501       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1502       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1503       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1504       v  += 25;
1505     }
1506     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1507     idx += 5;
1508   }
1509   /* backward solve the L^T */
1510   for (i=n-1; i>=0; i--){
1511     v    = aa + 25*diag[i] - 25;
1512     vi   = aj + diag[i] - 1;
1513     nz   = diag[i] - ai[i];
1514     idt  = 5*i;
1515     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1516     while (nz--) {
1517       idx   = 5*(*vi--);
1518       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1519       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1520       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1521       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1522       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1523       v -= 25;
1524     }
1525   }
1526 
1527   /* copy t into x according to permutation */
1528   ii = 0;
1529   for (i=0; i<n; i++) {
1530     ir      = 5*r[i];
1531     x[ir]   = t[ii];
1532     x[ir+1] = t[ii+1];
1533     x[ir+2] = t[ii+2];
1534     x[ir+3] = t[ii+3];
1535     x[ir+4] = t[ii+4];
1536     ii += 5;
1537   }
1538 
1539   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1540   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1541   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1543   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 #undef __FUNCT__
1548 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_newdatastruct"
1549 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
1550 {
1551   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1552   PetscErrorCode ierr;
1553   IS             iscol=a->col,isrow=a->row;
1554   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1555   const PetscInt *r,*c,*rout,*cout;
1556   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1557   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1558   MatScalar      *aa=a->a,*v;
1559   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1560   PetscScalar    *x,*b,*t;
1561 
1562   PetscFunctionBegin;
1563   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565   t = a->solve_work;
1566 
1567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1569 
1570   /* copy b into temp work space according to permutation */
1571   for(i=0;i<n;i++){
1572     ii = bs*i; ic = bs*c[i];
1573     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1574     t[ii+4] = b[ic+4];
1575   }
1576 
1577   /* forward solve the U^T */
1578   idx = 0;
1579   for (i=0; i<n; i++) {
1580     v     = aa + bs2*diag[i];
1581     /* multiply by the inverse of the block diagonal */
1582     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1583     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1584     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1585     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1586     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1587     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1588     v -= bs2;
1589 
1590     vi    = aj + diag[i] - 1;
1591     nz    = diag[i] - diag[i+1] - 1;
1592     for(j=0;j>-nz;j--){
1593       oidx = bs*vi[j];
1594       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1595       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1596       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1597       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1598       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1599       v  -= bs2;
1600     }
1601     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1602     idx += bs;
1603   }
1604   /* backward solve the L^T */
1605   for (i=n-1; i>=0; i--){
1606     v    = aa + bs2*ai[i];
1607     vi   = aj + ai[i];
1608     nz   = ai[i+1] - ai[i];
1609     idt  = bs*i;
1610     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1611     for(j=0;j<nz;j++){
1612       idx   = bs*vi[j];
1613       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1614       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1615       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1616       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1617       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1618       v += bs2;
1619     }
1620   }
1621 
1622   /* copy t into x according to permutation */
1623   for(i=0;i<n;i++){
1624     ii = bs*i;  ir = bs*r[i];
1625     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1626     x[ir+4] = t[ii+4];
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1639 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1640 {
1641   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1642   IS             iscol=a->col,isrow=a->row;
1643   PetscErrorCode ierr;
1644   const PetscInt *r,*c,*rout,*cout;
1645   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1646   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1647   MatScalar      *aa=a->a,*v;
1648   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1649   PetscScalar    *x,*b,*t;
1650 
1651   PetscFunctionBegin;
1652   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1653   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654   t  = a->solve_work;
1655 
1656   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1657   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1658 
1659   /* copy the b into temp work space according to permutation */
1660   ii = 0;
1661   for (i=0; i<n; i++) {
1662     ic      = 6*c[i];
1663     t[ii]   = b[ic];
1664     t[ii+1] = b[ic+1];
1665     t[ii+2] = b[ic+2];
1666     t[ii+3] = b[ic+3];
1667     t[ii+4] = b[ic+4];
1668     t[ii+5] = b[ic+5];
1669     ii += 6;
1670   }
1671 
1672   /* forward solve the U^T */
1673   idx = 0;
1674   for (i=0; i<n; i++) {
1675 
1676     v     = aa + 36*diag[i];
1677     /* multiply by the inverse of the block diagonal */
1678     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1679     x6    = t[5+idx];
1680     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1681     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1682     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1683     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1684     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1685     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1686     v += 36;
1687 
1688     vi    = aj + diag[i] + 1;
1689     nz    = ai[i+1] - diag[i] - 1;
1690     while (nz--) {
1691       oidx = 6*(*vi++);
1692       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1693       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1694       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1695       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1696       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1697       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1698       v  += 36;
1699     }
1700     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1701     t[5+idx] = s6;
1702     idx += 6;
1703   }
1704   /* backward solve the L^T */
1705   for (i=n-1; i>=0; i--){
1706     v    = aa + 36*diag[i] - 36;
1707     vi   = aj + diag[i] - 1;
1708     nz   = diag[i] - ai[i];
1709     idt  = 6*i;
1710     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1711     s6 = t[5+idt];
1712     while (nz--) {
1713       idx   = 6*(*vi--);
1714       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1715       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1716       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1717       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1718       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1719       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1720       v -= 36;
1721     }
1722   }
1723 
1724   /* copy t into x according to permutation */
1725   ii = 0;
1726   for (i=0; i<n; i++) {
1727     ir      = 6*r[i];
1728     x[ir]   = t[ii];
1729     x[ir+1] = t[ii+1];
1730     x[ir+2] = t[ii+2];
1731     x[ir+3] = t[ii+3];
1732     x[ir+4] = t[ii+4];
1733     x[ir+5] = t[ii+5];
1734     ii += 6;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 #undef __FUNCT__
1746 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_newdatastruct"
1747 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
1748 {
1749   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1750   PetscErrorCode ierr;
1751   IS             iscol=a->col,isrow=a->row;
1752   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1753   const PetscInt *r,*c,*rout,*cout;
1754   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1755   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1756   MatScalar      *aa=a->a,*v;
1757   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1758   PetscScalar    *x,*b,*t;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* copy b into temp work space according to permutation */
1769   for(i=0;i<n;i++){
1770     ii = bs*i; ic = bs*c[i];
1771     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1772     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1773   }
1774 
1775   /* forward solve the U^T */
1776   idx = 0;
1777   for (i=0; i<n; i++) {
1778     v     = aa + bs2*diag[i];
1779     /* multiply by the inverse of the block diagonal */
1780     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1781     x6    = t[5+idx];
1782     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1783     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1784     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1785     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1786     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1787     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1788     v -= bs2;
1789 
1790     vi    = aj + diag[i] - 1;
1791     nz    = diag[i] - diag[i+1] - 1;
1792     for(j=0;j>-nz;j--){
1793       oidx = bs*vi[j];
1794       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1795       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1796       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1797       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1798       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1799       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1800       v  -= bs2;
1801     }
1802     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1803     t[5+idx] = s6;
1804     idx += bs;
1805   }
1806   /* backward solve the L^T */
1807   for (i=n-1; i>=0; i--){
1808     v    = aa + bs2*ai[i];
1809     vi   = aj + ai[i];
1810     nz   = ai[i+1] - ai[i];
1811     idt  = bs*i;
1812     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1813     s6   = t[5+idt];
1814    for(j=0;j<nz;j++){
1815       idx   = bs*vi[j];
1816       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1817       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1818       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1819       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1820       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1821       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1822       v += bs2;
1823     }
1824   }
1825 
1826   /* copy t into x according to permutation */
1827   for(i=0;i<n;i++){
1828     ii = bs*i;  ir = bs*r[i];
1829     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1830     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1831   }
1832 
1833   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1834   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1835   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1837   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1838   PetscFunctionReturn(0);
1839 }
1840 
1841 #undef __FUNCT__
1842 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1843 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1844 {
1845   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1846   IS             iscol=a->col,isrow=a->row;
1847   PetscErrorCode ierr;
1848   const PetscInt *r,*c,*rout,*cout;
1849   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1851   MatScalar      *aa=a->a,*v;
1852   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1853   PetscScalar    *x,*b,*t;
1854 
1855   PetscFunctionBegin;
1856   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858   t  = a->solve_work;
1859 
1860   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862 
1863   /* copy the b into temp work space according to permutation */
1864   ii = 0;
1865   for (i=0; i<n; i++) {
1866     ic      = 7*c[i];
1867     t[ii]   = b[ic];
1868     t[ii+1] = b[ic+1];
1869     t[ii+2] = b[ic+2];
1870     t[ii+3] = b[ic+3];
1871     t[ii+4] = b[ic+4];
1872     t[ii+5] = b[ic+5];
1873     t[ii+6] = b[ic+6];
1874     ii += 7;
1875   }
1876 
1877   /* forward solve the U^T */
1878   idx = 0;
1879   for (i=0; i<n; i++) {
1880 
1881     v     = aa + 49*diag[i];
1882     /* multiply by the inverse of the block diagonal */
1883     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1884     x6    = t[5+idx]; x7 = t[6+idx];
1885     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1886     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1887     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1888     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1889     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1890     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1891     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1892     v += 49;
1893 
1894     vi    = aj + diag[i] + 1;
1895     nz    = ai[i+1] - diag[i] - 1;
1896     while (nz--) {
1897       oidx = 7*(*vi++);
1898       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1899       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1900       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1901       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1902       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1903       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1904       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1905       v  += 49;
1906     }
1907     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1908     t[5+idx] = s6;t[6+idx] = s7;
1909     idx += 7;
1910   }
1911   /* backward solve the L^T */
1912   for (i=n-1; i>=0; i--){
1913     v    = aa + 49*diag[i] - 49;
1914     vi   = aj + diag[i] - 1;
1915     nz   = diag[i] - ai[i];
1916     idt  = 7*i;
1917     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1918     s6 = t[5+idt];s7 = t[6+idt];
1919     while (nz--) {
1920       idx   = 7*(*vi--);
1921       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1922       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1923       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1924       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1925       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1926       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1927       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1928       v -= 49;
1929     }
1930   }
1931 
1932   /* copy t into x according to permutation */
1933   ii = 0;
1934   for (i=0; i<n; i++) {
1935     ir      = 7*r[i];
1936     x[ir]   = t[ii];
1937     x[ir+1] = t[ii+1];
1938     x[ir+2] = t[ii+2];
1939     x[ir+3] = t[ii+3];
1940     x[ir+4] = t[ii+4];
1941     x[ir+5] = t[ii+5];
1942     x[ir+6] = t[ii+6];
1943     ii += 7;
1944   }
1945 
1946   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1951   PetscFunctionReturn(0);
1952 }
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_newdatastruct"
1955 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
1956 {
1957   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1958   PetscErrorCode ierr;
1959   IS             iscol=a->col,isrow=a->row;
1960   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1961   const PetscInt *r,*c,*rout,*cout;
1962   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1963   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1964   MatScalar      *aa=a->a,*v;
1965   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1966   PetscScalar    *x,*b,*t;
1967 
1968   PetscFunctionBegin;
1969   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1971   t = a->solve_work;
1972 
1973   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1974   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1975 
1976   /* copy b into temp work space according to permutation */
1977   for(i=0;i<n;i++){
1978     ii = bs*i; ic = bs*c[i];
1979     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1980     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1981   }
1982 
1983   /* forward solve the U^T */
1984   idx = 0;
1985   for (i=0; i<n; i++) {
1986     v     = aa + bs2*diag[i];
1987     /* multiply by the inverse of the block diagonal */
1988     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1989     x6    = t[5+idx]; x7 = t[6+idx];
1990     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1991     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1992     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1993     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1994     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1995     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1996     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1997     v -= bs2;
1998 
1999     vi    = aj + diag[i] - 1;
2000     nz    = diag[i] - diag[i+1] - 1;
2001     for(j=0;j>-nz;j--){
2002       oidx = bs*vi[j];
2003       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2004       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2005       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2006       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2007       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2008       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2009       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2010       v  -= bs2;
2011     }
2012     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2013     t[5+idx] = s6;  t[6+idx] = s7;
2014     idx += bs;
2015   }
2016   /* backward solve the L^T */
2017   for (i=n-1; i>=0; i--){
2018     v    = aa + bs2*ai[i];
2019     vi   = aj + ai[i];
2020     nz   = ai[i+1] - ai[i];
2021     idt  = bs*i;
2022     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2023     s6   = t[5+idt];  s7 = t[6+idt];
2024    for(j=0;j<nz;j++){
2025       idx   = bs*vi[j];
2026       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2027       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2028       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2029       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2030       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2031       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2032       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2033       v += bs2;
2034     }
2035   }
2036 
2037   /* copy t into x according to permutation */
2038   for(i=0;i<n;i++){
2039     ii = bs*i;  ir = bs*r[i];
2040     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2041     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2042   }
2043 
2044   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2045   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2046   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2048   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2049   PetscFunctionReturn(0);
2050 }
2051 
2052 /* ----------------------------------------------------------- */
2053 #undef __FUNCT__
2054 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
2055 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2056 {
2057   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2058   IS             iscol=a->col,isrow=a->row;
2059   PetscErrorCode ierr;
2060   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2061   PetscInt       i,n=a->mbs;
2062   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
2063   MatScalar      *aa=a->a,*v;
2064   PetscScalar    *x,*b,*s,*t,*ls;
2065 
2066   PetscFunctionBegin;
2067   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2068   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2069   t  = a->solve_work;
2070 
2071   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2072   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2073 
2074   /* forward solve the lower triangular */
2075   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2076   for (i=1; i<n; i++) {
2077     v   = aa + bs2*ai[i];
2078     vi  = aj + ai[i];
2079     nz  = a->diag[i] - ai[i];
2080     s = t + bs*i;
2081     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2082     while (nz--) {
2083       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2084       v += bs2;
2085     }
2086   }
2087   /* backward solve the upper triangular */
2088   ls = a->solve_work + A->cmap->n;
2089   for (i=n-1; i>=0; i--){
2090     v   = aa + bs2*(a->diag[i] + 1);
2091     vi  = aj + a->diag[i] + 1;
2092     nz  = ai[i+1] - a->diag[i] - 1;
2093     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2094     while (nz--) {
2095       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2096       v += bs2;
2097     }
2098     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2099     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2100   }
2101 
2102   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2103   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2104   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2105   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2106   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2107   PetscFunctionReturn(0);
2108 }
2109 
2110 /* ----------------------------------------------------------- */
2111 #undef __FUNCT__
2112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2113 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2114 {
2115   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2116   IS                iscol=a->col,isrow=a->row;
2117   PetscErrorCode    ierr;
2118   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2119   PetscInt          i,n=a->mbs,j;
2120   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2121   const MatScalar   *aa=a->a,*v;
2122   PetscScalar       *x,*t,*ls;
2123   const PetscScalar *b;
2124   PetscFunctionBegin;
2125   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2126   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2127   t    = a->solve_work;
2128 
2129   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2130   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2131 
2132   /* copy the b into temp work space according to permutation */
2133   for (i=0; i<n; i++) {
2134     for (j=0; j<bs; j++) {
2135       t[i*bs+j] = b[c[i]*bs+j];
2136     }
2137   }
2138 
2139 
2140   /* forward solve the upper triangular transpose */
2141   ls = a->solve_work + A->cmap->n;
2142   for (i=0; i<n; i++){
2143     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2144     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2145     v   = aa + bs2*(a->diag[i] + 1);
2146     vi  = aj + a->diag[i] + 1;
2147     nz  = ai[i+1] - a->diag[i] - 1;
2148     while (nz--) {
2149       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2150       v += bs2;
2151     }
2152   }
2153 
2154   /* backward solve the lower triangular transpose */
2155   for (i=n-1; i>=0; i--) {
2156     v   = aa + bs2*ai[i];
2157     vi  = aj + ai[i];
2158     nz  = a->diag[i] - ai[i];
2159     while (nz--) {
2160       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2161       v += bs2;
2162     }
2163   }
2164 
2165   /* copy t into x according to permutation */
2166   for (i=0; i<n; i++) {
2167     for (j=0; j<bs; j++) {
2168       x[bs*r[i]+j]   = t[bs*i+j];
2169     }
2170   }
2171 
2172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2174   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2176   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2177   PetscFunctionReturn(0);
2178 }
2179 
2180 #undef __FUNCT__
2181 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct"
2182 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
2183 {
2184   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2185   IS                iscol=a->col,isrow=a->row;
2186   PetscErrorCode    ierr;
2187   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2188   PetscInt          i,n=a->mbs,j;
2189   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2190   const MatScalar   *aa=a->a,*v;
2191   PetscScalar       *x,*t,*ls;
2192   const PetscScalar *b;
2193   PetscFunctionBegin;
2194   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2195   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2196   t    = a->solve_work;
2197 
2198   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2199   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2200 
2201   /* copy the b into temp work space according to permutation */
2202   for (i=0; i<n; i++) {
2203     for (j=0; j<bs; j++) {
2204       t[i*bs+j] = b[c[i]*bs+j];
2205     }
2206   }
2207 
2208 
2209   /* forward solve the upper triangular transpose */
2210   ls = a->solve_work + A->cmap->n;
2211   for (i=0; i<n; i++){
2212     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2213     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2214     v   = aa + bs2*(diag[i] - 1);
2215     vi  = aj + diag[i] - 1;
2216     nz  = diag[i] - diag[i+1] - 1;
2217     for(j=0;j>-nz;j--){
2218       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2219       v -= bs2;
2220     }
2221   }
2222 
2223   /* backward solve the lower triangular transpose */
2224   for (i=n-1; i>=0; i--) {
2225     v   = aa + bs2*ai[i];
2226     vi  = aj + ai[i];
2227     nz  = ai[i+1] - ai[i];
2228     for(j=0;j<nz;j++){
2229       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2230       v += bs2;
2231     }
2232   }
2233 
2234   /* copy t into x according to permutation */
2235   for (i=0; i<n; i++) {
2236     for (j=0; j<bs; j++) {
2237       x[bs*r[i]+j]   = t[bs*i+j];
2238     }
2239   }
2240 
2241   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2242   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2243   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2245   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2246   PetscFunctionReturn(0);
2247 }
2248 
2249 #undef __FUNCT__
2250 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2251 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2252 {
2253   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2254   IS             iscol=a->col,isrow=a->row;
2255   PetscErrorCode ierr;
2256   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
2257   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
2258   MatScalar      *aa=a->a,*v;
2259   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2260   PetscScalar    *x,*b,*t;
2261 
2262   PetscFunctionBegin;
2263   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2264   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2265   t  = a->solve_work;
2266 
2267   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2268   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2269 
2270   /* forward solve the lower triangular */
2271   idx    = 7*(*r++);
2272   t[0] = b[idx];   t[1] = b[1+idx];
2273   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2274   t[5] = b[5+idx]; t[6] = b[6+idx];
2275 
2276   for (i=1; i<n; i++) {
2277     v     = aa + 49*ai[i];
2278     vi    = aj + ai[i];
2279     nz    = diag[i] - ai[i];
2280     idx   = 7*(*r++);
2281     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2282     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2283     while (nz--) {
2284       idx   = 7*(*vi++);
2285       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2286       x4    = t[3+idx];x5 = t[4+idx];
2287       x6    = t[5+idx];x7 = t[6+idx];
2288       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2289       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2290       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2291       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2292       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2293       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2294       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2295       v += 49;
2296     }
2297     idx = 7*i;
2298     t[idx]   = s1;t[1+idx] = s2;
2299     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2300     t[5+idx] = s6;t[6+idx] = s7;
2301   }
2302   /* backward solve the upper triangular */
2303   for (i=n-1; i>=0; i--){
2304     v    = aa + 49*diag[i] + 49;
2305     vi   = aj + diag[i] + 1;
2306     nz   = ai[i+1] - diag[i] - 1;
2307     idt  = 7*i;
2308     s1 = t[idt];  s2 = t[1+idt];
2309     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2310     s6 = t[5+idt];s7 = t[6+idt];
2311     while (nz--) {
2312       idx   = 7*(*vi++);
2313       x1    = t[idx];   x2 = t[1+idx];
2314       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2315       x6    = t[5+idx]; x7 = t[6+idx];
2316       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2317       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2318       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2319       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2320       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2321       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2322       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2323       v += 49;
2324     }
2325     idc = 7*(*c--);
2326     v   = aa + 49*diag[i];
2327     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2328                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2329     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2330                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2331     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2332                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2333     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2334                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2335     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2336                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2337     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2338                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2339     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2340                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2341   }
2342 
2343   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2344   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2345   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2346   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2347   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2348   PetscFunctionReturn(0);
2349 }
2350 
2351 #undef __FUNCT__
2352 #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
2353 PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
2354 {
2355   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2356   IS             iscol=a->col,isrow=a->row;
2357   PetscErrorCode ierr;
2358   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
2359   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2360   MatScalar      *aa=a->a,*v;
2361   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2362   PetscScalar    *x,*b,*t;
2363 
2364   PetscFunctionBegin;
2365   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2366   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2367   t  = a->solve_work;
2368 
2369   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2370   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2371 
2372   /* forward solve the lower triangular */
2373   idx    = 7*r[0];
2374   t[0] = b[idx];   t[1] = b[1+idx];
2375   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2376   t[5] = b[5+idx]; t[6] = b[6+idx];
2377 
2378   for (i=1; i<n; i++) {
2379     v     = aa + 49*ai[i];
2380     vi    = aj + ai[i];
2381     nz    = ai[i+1] - ai[i];
2382     idx   = 7*r[i];
2383     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2384     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2385     for(m=0;m<nz;m++){
2386       idx   = 7*vi[m];
2387       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2388       x4    = t[3+idx];x5 = t[4+idx];
2389       x6    = t[5+idx];x7 = t[6+idx];
2390       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2391       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2392       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2393       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2394       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2395       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2396       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2397       v += 49;
2398     }
2399     idx = 7*i;
2400     t[idx]   = s1;t[1+idx] = s2;
2401     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2402     t[5+idx] = s6;t[6+idx] = s7;
2403   }
2404   /* backward solve the upper triangular */
2405   for (i=n-1; i>=0; i--){
2406     v    = aa + 49*(adiag[i+1]+1);
2407     vi   = aj + adiag[i+1]+1;
2408     nz   = adiag[i] - adiag[i+1] - 1;
2409     idt  = 7*i;
2410     s1 = t[idt];  s2 = t[1+idt];
2411     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2412     s6 = t[5+idt];s7 = t[6+idt];
2413     for(m=0;m<nz;m++){
2414       idx   = 7*vi[m];
2415       x1    = t[idx];   x2 = t[1+idx];
2416       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2417       x6    = t[5+idx]; x7 = t[6+idx];
2418       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2419       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2420       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2421       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2422       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2423       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2424       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2425       v += 49;
2426     }
2427     idc = 7*c[i];
2428     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2429                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2430     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2431                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2432     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2433                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2434     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2435                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2436     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2437                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2438     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2439                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2440     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2441                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2442   }
2443 
2444   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2445   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2446   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2447   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2448   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2449   PetscFunctionReturn(0);
2450 }
2451 
2452 #undef __FUNCT__
2453 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2454 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2455 {
2456   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2457   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2458   PetscErrorCode    ierr;
2459   PetscInt          *diag = a->diag,jdx;
2460   const MatScalar   *aa=a->a,*v;
2461   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2462   const PetscScalar *b;
2463 
2464   PetscFunctionBegin;
2465   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2466   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2467   /* forward solve the lower triangular */
2468   idx    = 0;
2469   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2470   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2471   x[6] = b[6+idx];
2472   for (i=1; i<n; i++) {
2473     v     =  aa + 49*ai[i];
2474     vi    =  aj + ai[i];
2475     nz    =  diag[i] - ai[i];
2476     idx   =  7*i;
2477     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2478     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2479     s7  =  b[6+idx];
2480     while (nz--) {
2481       jdx   = 7*(*vi++);
2482       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2483       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2484       x7    = x[6+jdx];
2485       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2486       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2487       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2488       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2489       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2490       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2491       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2492       v += 49;
2493      }
2494     x[idx]   = s1;
2495     x[1+idx] = s2;
2496     x[2+idx] = s3;
2497     x[3+idx] = s4;
2498     x[4+idx] = s5;
2499     x[5+idx] = s6;
2500     x[6+idx] = s7;
2501   }
2502   /* backward solve the upper triangular */
2503   for (i=n-1; i>=0; i--){
2504     v    = aa + 49*diag[i] + 49;
2505     vi   = aj + diag[i] + 1;
2506     nz   = ai[i+1] - diag[i] - 1;
2507     idt  = 7*i;
2508     s1 = x[idt];   s2 = x[1+idt];
2509     s3 = x[2+idt]; s4 = x[3+idt];
2510     s5 = x[4+idt]; s6 = x[5+idt];
2511     s7 = x[6+idt];
2512     while (nz--) {
2513       idx   = 7*(*vi++);
2514       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2515       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2516       x7    = x[6+idx];
2517       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2518       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2519       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2520       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2521       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2522       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2523       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2524       v += 49;
2525     }
2526     v        = aa + 49*diag[i];
2527     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2528                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2529     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2530                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2531     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2532                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2533     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2534                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2535     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2536                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2537     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2538                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2539     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2540                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2541   }
2542 
2543   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2544   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2545   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2546   PetscFunctionReturn(0);
2547 }
2548 
2549 #undef __FUNCT__
2550 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
2551 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2552 {
2553     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2554     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2555     PetscErrorCode    ierr;
2556     PetscInt          idx,jdx,idt;
2557     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2558     const MatScalar   *aa=a->a,*v;
2559     PetscScalar       *x;
2560     const PetscScalar *b;
2561     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2562 
2563     PetscFunctionBegin;
2564     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2565     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2566     /* forward solve the lower triangular */
2567     idx    = 0;
2568     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2569     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2570     for (i=1; i<n; i++) {
2571        v    = aa + bs2*ai[i];
2572        vi   = aj + ai[i];
2573        nz   = ai[i+1] - ai[i];
2574       idx   = bs*i;
2575        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2576        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2577        for(k=0;k<nz;k++) {
2578           jdx   = bs*vi[k];
2579           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2580 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
2581           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2582           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2583           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2584 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2585           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2586 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2587 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2588           v   +=  bs2;
2589         }
2590 
2591        x[idx]   = s1;
2592        x[1+idx] = s2;
2593        x[2+idx] = s3;
2594        x[3+idx] = s4;
2595        x[4+idx] = s5;
2596        x[5+idx] = s6;
2597        x[6+idx] = s7;
2598     }
2599 
2600    /* backward solve the upper triangular */
2601   for (i=n-1; i>=0; i--){
2602     v   = aa + bs2*(adiag[i+1]+1);
2603      vi  = aj + adiag[i+1]+1;
2604      nz  = adiag[i] - adiag[i+1]-1;
2605      idt = bs*i;
2606      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2607      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2608     for(k=0;k<nz;k++) {
2609       idx   = bs*vi[k];
2610        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2611        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2612        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2613        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2614        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2615        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2616        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2617        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2618        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2619         v   +=  bs2;
2620     }
2621     /* x = inv_diagonal*x */
2622     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2623     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2624     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2625     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2626     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2627     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2628     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2629   }
2630 
2631   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2633   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2634   PetscFunctionReturn(0);
2635 }
2636 
2637 #undef __FUNCT__
2638 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2639 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2640 {
2641   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2642   IS                iscol=a->col,isrow=a->row;
2643   PetscErrorCode    ierr;
2644   const PetscInt    *r,*c,*rout,*cout;
2645   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2646   const MatScalar   *aa=a->a,*v;
2647   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2648   const PetscScalar *b;
2649   PetscFunctionBegin;
2650   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2651   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2652   t  = a->solve_work;
2653 
2654   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2655   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2656 
2657   /* forward solve the lower triangular */
2658   idx    = 6*(*r++);
2659   t[0] = b[idx];   t[1] = b[1+idx];
2660   t[2] = b[2+idx]; t[3] = b[3+idx];
2661   t[4] = b[4+idx]; t[5] = b[5+idx];
2662   for (i=1; i<n; i++) {
2663     v     = aa + 36*ai[i];
2664     vi    = aj + ai[i];
2665     nz    = diag[i] - ai[i];
2666     idx   = 6*(*r++);
2667     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2668     s5  = b[4+idx]; s6 = b[5+idx];
2669     while (nz--) {
2670       idx   = 6*(*vi++);
2671       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2672       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2673       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2674       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2675       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2676       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2677       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2678       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2679       v += 36;
2680     }
2681     idx = 6*i;
2682     t[idx]   = s1;t[1+idx] = s2;
2683     t[2+idx] = s3;t[3+idx] = s4;
2684     t[4+idx] = s5;t[5+idx] = s6;
2685   }
2686   /* backward solve the upper triangular */
2687   for (i=n-1; i>=0; i--){
2688     v    = aa + 36*diag[i] + 36;
2689     vi   = aj + diag[i] + 1;
2690     nz   = ai[i+1] - diag[i] - 1;
2691     idt  = 6*i;
2692     s1 = t[idt];  s2 = t[1+idt];
2693     s3 = t[2+idt];s4 = t[3+idt];
2694     s5 = t[4+idt];s6 = t[5+idt];
2695     while (nz--) {
2696       idx   = 6*(*vi++);
2697       x1    = t[idx];   x2 = t[1+idx];
2698       x3    = t[2+idx]; x4 = t[3+idx];
2699       x5    = t[4+idx]; x6 = t[5+idx];
2700       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2701       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2702       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2703       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2704       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2705       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2706       v += 36;
2707     }
2708     idc = 6*(*c--);
2709     v   = aa + 36*diag[i];
2710     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2711                                  v[18]*s4+v[24]*s5+v[30]*s6;
2712     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2713                                  v[19]*s4+v[25]*s5+v[31]*s6;
2714     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2715                                  v[20]*s4+v[26]*s5+v[32]*s6;
2716     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2717                                  v[21]*s4+v[27]*s5+v[33]*s6;
2718     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2719                                  v[22]*s4+v[28]*s5+v[34]*s6;
2720     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2721                                  v[23]*s4+v[29]*s5+v[35]*s6;
2722   }
2723 
2724   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2725   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2726   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2727   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2728   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2729   PetscFunctionReturn(0);
2730 }
2731 
2732 #undef __FUNCT__
2733 #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
2734 PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
2735 {
2736   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2737   IS                iscol=a->col,isrow=a->row;
2738   PetscErrorCode    ierr;
2739   const PetscInt    *r,*c,*rout,*cout;
2740   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2741   const MatScalar   *aa=a->a,*v;
2742   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2743   const PetscScalar *b;
2744   PetscFunctionBegin;
2745   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2746   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2747   t  = a->solve_work;
2748 
2749   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2750   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2751 
2752   /* forward solve the lower triangular */
2753   idx    = 6*r[0];
2754   t[0] = b[idx];   t[1] = b[1+idx];
2755   t[2] = b[2+idx]; t[3] = b[3+idx];
2756   t[4] = b[4+idx]; t[5] = b[5+idx];
2757   for (i=1; i<n; i++) {
2758     v     = aa + 36*ai[i];
2759     vi    = aj + ai[i];
2760     nz    = ai[i+1] - ai[i];
2761     idx   = 6*r[i];
2762     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2763     s5  = b[4+idx]; s6 = b[5+idx];
2764     for(m=0;m<nz;m++){
2765       idx   = 6*vi[m];
2766       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2767       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2768       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2769       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2770       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2771       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2772       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2773       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2774       v += 36;
2775     }
2776     idx = 6*i;
2777     t[idx]   = s1;t[1+idx] = s2;
2778     t[2+idx] = s3;t[3+idx] = s4;
2779     t[4+idx] = s5;t[5+idx] = s6;
2780   }
2781   /* backward solve the upper triangular */
2782   for (i=n-1; i>=0; i--){
2783     v    = aa + 36*(adiag[i+1]+1);
2784     vi   = aj + adiag[i+1]+1;
2785     nz   = adiag[i] - adiag[i+1] - 1;
2786     idt  = 6*i;
2787     s1 = t[idt];  s2 = t[1+idt];
2788     s3 = t[2+idt];s4 = t[3+idt];
2789     s5 = t[4+idt];s6 = t[5+idt];
2790     for(m=0;m<nz;m++){
2791       idx   = 6*vi[m];
2792       x1    = t[idx];   x2 = t[1+idx];
2793       x3    = t[2+idx]; x4 = t[3+idx];
2794       x5    = t[4+idx]; x6 = t[5+idx];
2795       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2796       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2797       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2798       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2799       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2800       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2801       v += 36;
2802     }
2803     idc = 6*c[i];
2804     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2805                                  v[18]*s4+v[24]*s5+v[30]*s6;
2806     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2807                                  v[19]*s4+v[25]*s5+v[31]*s6;
2808     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2809                                  v[20]*s4+v[26]*s5+v[32]*s6;
2810     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2811                                  v[21]*s4+v[27]*s5+v[33]*s6;
2812     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2813                                  v[22]*s4+v[28]*s5+v[34]*s6;
2814     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2815                                  v[23]*s4+v[29]*s5+v[35]*s6;
2816   }
2817 
2818   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2819   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2820   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2821   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2822   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2823   PetscFunctionReturn(0);
2824 }
2825 
2826 #undef __FUNCT__
2827 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2828 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
2829 {
2830   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2831   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2832   PetscErrorCode    ierr;
2833   PetscInt          *diag = a->diag,jdx;
2834   const MatScalar   *aa=a->a,*v;
2835   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2836   const PetscScalar *b;
2837 
2838   PetscFunctionBegin;
2839   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2840   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2841   /* forward solve the lower triangular */
2842   idx    = 0;
2843   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2844   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2845   for (i=1; i<n; i++) {
2846     v     =  aa + 36*ai[i];
2847     vi    =  aj + ai[i];
2848     nz    =  diag[i] - ai[i];
2849     idx   =  6*i;
2850     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2851     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2852     while (nz--) {
2853       jdx   = 6*(*vi++);
2854       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2855       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2856       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2857       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2858       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2859       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2860       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2861       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2862       v += 36;
2863      }
2864     x[idx]   = s1;
2865     x[1+idx] = s2;
2866     x[2+idx] = s3;
2867     x[3+idx] = s4;
2868     x[4+idx] = s5;
2869     x[5+idx] = s6;
2870   }
2871   /* backward solve the upper triangular */
2872   for (i=n-1; i>=0; i--){
2873     v    = aa + 36*diag[i] + 36;
2874     vi   = aj + diag[i] + 1;
2875     nz   = ai[i+1] - diag[i] - 1;
2876     idt  = 6*i;
2877     s1 = x[idt];   s2 = x[1+idt];
2878     s3 = x[2+idt]; s4 = x[3+idt];
2879     s5 = x[4+idt]; s6 = x[5+idt];
2880     while (nz--) {
2881       idx   = 6*(*vi++);
2882       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2883       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2884       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2885       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2886       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2887       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2888       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2889       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2890       v += 36;
2891     }
2892     v        = aa + 36*diag[i];
2893     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2894     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2895     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2896     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2897     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2898     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2899   }
2900 
2901   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2902   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2903   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2904   PetscFunctionReturn(0);
2905 }
2906 
2907 #undef __FUNCT__
2908 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2909 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2910 {
2911     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2912     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2913     PetscErrorCode    ierr;
2914     PetscInt          idx,jdx,idt;
2915     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2916     const MatScalar   *aa=a->a,*v;
2917     PetscScalar       *x;
2918     const PetscScalar *b;
2919     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2920 
2921     PetscFunctionBegin;
2922     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2923     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2924     /* forward solve the lower triangular */
2925     idx    = 0;
2926     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2927     x[4] = b[4+idx];x[5] = b[5+idx];
2928     for (i=1; i<n; i++) {
2929        v    = aa + bs2*ai[i];
2930        vi   = aj + ai[i];
2931        nz   = ai[i+1] - ai[i];
2932       idx   = bs*i;
2933        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2934        s5   = b[4+idx];s6 = b[5+idx];
2935        for(k=0;k<nz;k++){
2936           jdx   = bs*vi[k];
2937           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2938 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2939           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2940           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2941           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2942 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2943           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2944 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2945           v   +=  bs2;
2946         }
2947 
2948        x[idx]   = s1;
2949        x[1+idx] = s2;
2950        x[2+idx] = s3;
2951        x[3+idx] = s4;
2952        x[4+idx] = s5;
2953        x[5+idx] = s6;
2954     }
2955 
2956    /* backward solve the upper triangular */
2957   for (i=n-1; i>=0; i--){
2958     v   = aa + bs2*(adiag[i+1]+1);
2959      vi  = aj + adiag[i+1]+1;
2960      nz  = adiag[i] - adiag[i+1]-1;
2961      idt = bs*i;
2962      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2963      s5 = x[4+idt];s6 = x[5+idt];
2964      for(k=0;k<nz;k++){
2965       idx   = bs*vi[k];
2966        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2967        x5    = x[4+idx];x6 = x[5+idx];
2968        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2969        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2970        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2971        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2972        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2973        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2974         v   +=  bs2;
2975     }
2976     /* x = inv_diagonal*x */
2977    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2978    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2979    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2980    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2981    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2982    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2983   }
2984 
2985   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2986   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2987   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2988   PetscFunctionReturn(0);
2989 }
2990 
2991 #undef __FUNCT__
2992 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2993 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
2994 {
2995   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2996   IS                iscol=a->col,isrow=a->row;
2997   PetscErrorCode    ierr;
2998   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
2999   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3000   const MatScalar   *aa=a->a,*v;
3001   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3002   const PetscScalar *b;
3003 
3004   PetscFunctionBegin;
3005   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3006   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3007   t  = a->solve_work;
3008 
3009   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3010   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3011 
3012   /* forward solve the lower triangular */
3013   idx    = 5*(*r++);
3014   t[0] = b[idx];   t[1] = b[1+idx];
3015   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3016   for (i=1; i<n; i++) {
3017     v     = aa + 25*ai[i];
3018     vi    = aj + ai[i];
3019     nz    = diag[i] - ai[i];
3020     idx   = 5*(*r++);
3021     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3022     s5  = b[4+idx];
3023     while (nz--) {
3024       idx   = 5*(*vi++);
3025       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3026       x4    = t[3+idx];x5 = t[4+idx];
3027       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3028       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3029       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3030       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3031       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3032       v += 25;
3033     }
3034     idx = 5*i;
3035     t[idx]   = s1;t[1+idx] = s2;
3036     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3037   }
3038   /* backward solve the upper triangular */
3039   for (i=n-1; i>=0; i--){
3040     v    = aa + 25*diag[i] + 25;
3041     vi   = aj + diag[i] + 1;
3042     nz   = ai[i+1] - diag[i] - 1;
3043     idt  = 5*i;
3044     s1 = t[idt];  s2 = t[1+idt];
3045     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3046     while (nz--) {
3047       idx   = 5*(*vi++);
3048       x1    = t[idx];   x2 = t[1+idx];
3049       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3050       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3051       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3052       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3053       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3054       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3055       v += 25;
3056     }
3057     idc = 5*(*c--);
3058     v   = aa + 25*diag[i];
3059     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3060                                  v[15]*s4+v[20]*s5;
3061     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3062                                  v[16]*s4+v[21]*s5;
3063     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3064                                  v[17]*s4+v[22]*s5;
3065     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3066                                  v[18]*s4+v[23]*s5;
3067     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3068                                  v[19]*s4+v[24]*s5;
3069   }
3070 
3071   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3072   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3073   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3074   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3075   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3076   PetscFunctionReturn(0);
3077 }
3078 
3079 #undef __FUNCT__
3080 #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
3081 PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
3082 {
3083   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3084   IS                iscol=a->col,isrow=a->row;
3085   PetscErrorCode    ierr;
3086   const PetscInt    *r,*c,*rout,*cout;
3087   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3088   const MatScalar   *aa=a->a,*v;
3089   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3090   const PetscScalar *b;
3091 
3092   PetscFunctionBegin;
3093   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3094   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3095   t  = a->solve_work;
3096 
3097   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3098   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3099 
3100   /* forward solve the lower triangular */
3101   idx    = 5*r[0];
3102   t[0] = b[idx];   t[1] = b[1+idx];
3103   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3104   for (i=1; i<n; i++) {
3105     v     = aa + 25*ai[i];
3106     vi    = aj + ai[i];
3107     nz    = ai[i+1] - ai[i];
3108     idx   = 5*r[i];
3109     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3110     s5  = b[4+idx];
3111     for(m=0;m<nz;m++){
3112       idx   = 5*vi[m];
3113       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3114       x4    = t[3+idx];x5 = t[4+idx];
3115       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3116       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3117       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3118       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3119       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3120       v += 25;
3121     }
3122     idx = 5*i;
3123     t[idx]   = s1;t[1+idx] = s2;
3124     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3125   }
3126   /* backward solve the upper triangular */
3127   for (i=n-1; i>=0; i--){
3128     v    = aa + 25*(adiag[i+1]+1);
3129     vi   = aj + adiag[i+1]+1;
3130     nz   = adiag[i] - adiag[i+1] - 1;
3131     idt  = 5*i;
3132     s1 = t[idt];  s2 = t[1+idt];
3133     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3134     for(m=0;m<nz;m++){
3135       idx   = 5*vi[m];
3136       x1    = t[idx];   x2 = t[1+idx];
3137       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3138       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3139       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3140       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3141       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3142       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3143       v += 25;
3144     }
3145     idc = 5*c[i];
3146     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3147                                  v[15]*s4+v[20]*s5;
3148     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3149                                  v[16]*s4+v[21]*s5;
3150     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3151                                  v[17]*s4+v[22]*s5;
3152     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3153                                  v[18]*s4+v[23]*s5;
3154     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3155                                  v[19]*s4+v[24]*s5;
3156   }
3157 
3158   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3159   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3160   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3161   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3162   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3163   PetscFunctionReturn(0);
3164 }
3165 
3166 #undef __FUNCT__
3167 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3168 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3169 {
3170   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3171   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3172   PetscErrorCode    ierr;
3173   PetscInt          *diag = a->diag,jdx;
3174   const MatScalar   *aa=a->a,*v;
3175   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3176   const PetscScalar *b;
3177 
3178   PetscFunctionBegin;
3179   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3180   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3181   /* forward solve the lower triangular */
3182   idx    = 0;
3183   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3184   for (i=1; i<n; i++) {
3185     v     =  aa + 25*ai[i];
3186     vi    =  aj + ai[i];
3187     nz    =  diag[i] - ai[i];
3188     idx   =  5*i;
3189     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3190     while (nz--) {
3191       jdx   = 5*(*vi++);
3192       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3193       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3194       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3195       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3196       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3197       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3198       v    += 25;
3199     }
3200     x[idx]   = s1;
3201     x[1+idx] = s2;
3202     x[2+idx] = s3;
3203     x[3+idx] = s4;
3204     x[4+idx] = s5;
3205   }
3206   /* backward solve the upper triangular */
3207   for (i=n-1; i>=0; i--){
3208     v    = aa + 25*diag[i] + 25;
3209     vi   = aj + diag[i] + 1;
3210     nz   = ai[i+1] - diag[i] - 1;
3211     idt  = 5*i;
3212     s1 = x[idt];  s2 = x[1+idt];
3213     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3214     while (nz--) {
3215       idx   = 5*(*vi++);
3216       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3217       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3218       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3219       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3220       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3221       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3222       v    += 25;
3223     }
3224     v        = aa + 25*diag[i];
3225     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3226     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3227     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3228     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3229     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3230   }
3231 
3232   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3234   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3235   PetscFunctionReturn(0);
3236 }
3237 
3238 #undef __FUNCT__
3239 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
3240 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3241 {
3242   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3243   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3244   PetscErrorCode    ierr;
3245   PetscInt          jdx;
3246   const MatScalar   *aa=a->a,*v;
3247   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3248   const PetscScalar *b;
3249 
3250   PetscFunctionBegin;
3251   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3252   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3253   /* forward solve the lower triangular */
3254   idx    = 0;
3255   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3256   for (i=1; i<n; i++) {
3257     v   = aa + 25*ai[i];
3258     vi  = aj + ai[i];
3259     nz  = ai[i+1] - ai[i];
3260     idx = 5*i;
3261     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3262     for(k=0;k<nz;k++) {
3263       jdx   = 5*vi[k];
3264       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3265       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3266       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3267       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3268       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3269       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3270       v    += 25;
3271     }
3272     x[idx]   = s1;
3273     x[1+idx] = s2;
3274     x[2+idx] = s3;
3275     x[3+idx] = s4;
3276     x[4+idx] = s5;
3277   }
3278 
3279   /* backward solve the upper triangular */
3280   for (i=n-1; i>=0; i--){
3281     v   = aa + 25*(adiag[i+1]+1);
3282     vi  = aj + adiag[i+1]+1;
3283     nz  = adiag[i] - adiag[i+1]-1;
3284     idt = 5*i;
3285     s1 = x[idt];  s2 = x[1+idt];
3286     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3287     for(k=0;k<nz;k++){
3288       idx   = 5*vi[k];
3289       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3290       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3291       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3292       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3293       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3294       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3295       v    += 25;
3296     }
3297     /* x = inv_diagonal*x */
3298     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3299     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3300     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3301     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3302     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3303   }
3304 
3305   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3306   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3307   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3308   PetscFunctionReturn(0);
3309 }
3310 
3311 #undef __FUNCT__
3312 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3313 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3314 {
3315   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3316   IS                iscol=a->col,isrow=a->row;
3317   PetscErrorCode    ierr;
3318   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3319   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3320   const MatScalar   *aa=a->a,*v;
3321   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3322   const PetscScalar *b;
3323 
3324   PetscFunctionBegin;
3325   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3326   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3327   t  = a->solve_work;
3328 
3329   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3330   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3331 
3332   /* forward solve the lower triangular */
3333   idx    = 4*(*r++);
3334   t[0] = b[idx];   t[1] = b[1+idx];
3335   t[2] = b[2+idx]; t[3] = b[3+idx];
3336   for (i=1; i<n; i++) {
3337     v     = aa + 16*ai[i];
3338     vi    = aj + ai[i];
3339     nz    = diag[i] - ai[i];
3340     idx   = 4*(*r++);
3341     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3342     while (nz--) {
3343       idx   = 4*(*vi++);
3344       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3345       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3346       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3347       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3348       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3349       v    += 16;
3350     }
3351     idx        = 4*i;
3352     t[idx]   = s1;t[1+idx] = s2;
3353     t[2+idx] = s3;t[3+idx] = s4;
3354   }
3355   /* backward solve the upper triangular */
3356   for (i=n-1; i>=0; i--){
3357     v    = aa + 16*diag[i] + 16;
3358     vi   = aj + diag[i] + 1;
3359     nz   = ai[i+1] - diag[i] - 1;
3360     idt  = 4*i;
3361     s1 = t[idt];  s2 = t[1+idt];
3362     s3 = t[2+idt];s4 = t[3+idt];
3363     while (nz--) {
3364       idx   = 4*(*vi++);
3365       x1    = t[idx];   x2 = t[1+idx];
3366       x3    = t[2+idx]; x4 = t[3+idx];
3367       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3368       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3369       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3370       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3371       v += 16;
3372     }
3373     idc      = 4*(*c--);
3374     v        = aa + 16*diag[i];
3375     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3376     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3377     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3378     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3379   }
3380 
3381   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3382   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3383   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3384   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3385   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3386   PetscFunctionReturn(0);
3387 }
3388 
3389 #undef __FUNCT__
3390 #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
3391 PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
3392 {
3393   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3394   IS                iscol=a->col,isrow=a->row;
3395   PetscErrorCode    ierr;
3396   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3397   const PetscInt    *r,*c,*rout,*cout;
3398   const MatScalar   *aa=a->a,*v;
3399   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3400   const PetscScalar *b;
3401 
3402   PetscFunctionBegin;
3403   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3404   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3405   t  = a->solve_work;
3406 
3407   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3408   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3409 
3410   /* forward solve the lower triangular */
3411   idx    = 4*r[0];
3412   t[0] = b[idx];   t[1] = b[1+idx];
3413   t[2] = b[2+idx]; t[3] = b[3+idx];
3414   for (i=1; i<n; i++) {
3415     v     = aa + 16*ai[i];
3416     vi    = aj + ai[i];
3417     nz    = ai[i+1] - ai[i];
3418     idx   = 4*r[i];
3419     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3420     for(m=0;m<nz;m++){
3421       idx   = 4*vi[m];
3422       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3423       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3424       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3425       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3426       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3427       v    += 16;
3428     }
3429     idx        = 4*i;
3430     t[idx]   = s1;t[1+idx] = s2;
3431     t[2+idx] = s3;t[3+idx] = s4;
3432   }
3433   /* backward solve the upper triangular */
3434   for (i=n-1; i>=0; i--){
3435     v    = aa + 16*(adiag[i+1]+1);
3436     vi   = aj + adiag[i+1]+1;
3437     nz   = adiag[i] - adiag[i+1] - 1;
3438     idt  = 4*i;
3439     s1 = t[idt];  s2 = t[1+idt];
3440     s3 = t[2+idt];s4 = t[3+idt];
3441     for(m=0;m<nz;m++){
3442       idx   = 4*vi[m];
3443       x1    = t[idx];   x2 = t[1+idx];
3444       x3    = t[2+idx]; x4 = t[3+idx];
3445       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3446       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3447       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3448       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3449       v += 16;
3450     }
3451     idc      = 4*c[i];
3452     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3453     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3454     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3455     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3456   }
3457 
3458   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3459   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3460   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3461   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3462   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3463   PetscFunctionReturn(0);
3464 }
3465 
3466 #undef __FUNCT__
3467 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3468 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3469 {
3470   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3471   IS                iscol=a->col,isrow=a->row;
3472   PetscErrorCode    ierr;
3473   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3474   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3475   const MatScalar   *aa=a->a,*v;
3476   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3477   PetscScalar       *x;
3478   const PetscScalar *b;
3479 
3480   PetscFunctionBegin;
3481   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3482   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3483   t  = (MatScalar *)a->solve_work;
3484 
3485   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3486   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3487 
3488   /* forward solve the lower triangular */
3489   idx    = 4*(*r++);
3490   t[0] = (MatScalar)b[idx];
3491   t[1] = (MatScalar)b[1+idx];
3492   t[2] = (MatScalar)b[2+idx];
3493   t[3] = (MatScalar)b[3+idx];
3494   for (i=1; i<n; i++) {
3495     v     = aa + 16*ai[i];
3496     vi    = aj + ai[i];
3497     nz    = diag[i] - ai[i];
3498     idx   = 4*(*r++);
3499     s1 = (MatScalar)b[idx];
3500     s2 = (MatScalar)b[1+idx];
3501     s3 = (MatScalar)b[2+idx];
3502     s4 = (MatScalar)b[3+idx];
3503     while (nz--) {
3504       idx   = 4*(*vi++);
3505       x1  = t[idx];
3506       x2  = t[1+idx];
3507       x3  = t[2+idx];
3508       x4  = t[3+idx];
3509       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3510       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3511       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3512       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3513       v    += 16;
3514     }
3515     idx        = 4*i;
3516     t[idx]   = s1;
3517     t[1+idx] = s2;
3518     t[2+idx] = s3;
3519     t[3+idx] = s4;
3520   }
3521   /* backward solve the upper triangular */
3522   for (i=n-1; i>=0; i--){
3523     v    = aa + 16*diag[i] + 16;
3524     vi   = aj + diag[i] + 1;
3525     nz   = ai[i+1] - diag[i] - 1;
3526     idt  = 4*i;
3527     s1 = t[idt];
3528     s2 = t[1+idt];
3529     s3 = t[2+idt];
3530     s4 = t[3+idt];
3531     while (nz--) {
3532       idx   = 4*(*vi++);
3533       x1  = t[idx];
3534       x2  = t[1+idx];
3535       x3  = t[2+idx];
3536       x4  = t[3+idx];
3537       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3538       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3539       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3540       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3541       v += 16;
3542     }
3543     idc      = 4*(*c--);
3544     v        = aa + 16*diag[i];
3545     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3546     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3547     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3548     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3549     x[idc]   = (PetscScalar)t[idt];
3550     x[1+idc] = (PetscScalar)t[1+idt];
3551     x[2+idc] = (PetscScalar)t[2+idt];
3552     x[3+idc] = (PetscScalar)t[3+idt];
3553  }
3554 
3555   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3556   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3557   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3558   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3559   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3560   PetscFunctionReturn(0);
3561 }
3562 
3563 #if defined (PETSC_HAVE_SSE)
3564 
3565 #include PETSC_HAVE_SSE
3566 
3567 #undef __FUNCT__
3568 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3569 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3570 {
3571   /*
3572      Note: This code uses demotion of double
3573      to float when performing the mixed-mode computation.
3574      This may not be numerically reasonable for all applications.
3575   */
3576   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3577   IS             iscol=a->col,isrow=a->row;
3578   PetscErrorCode ierr;
3579   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3580   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3581   MatScalar      *aa=a->a,*v;
3582   PetscScalar    *x,*b,*t;
3583 
3584   /* Make space in temp stack for 16 Byte Aligned arrays */
3585   float           ssealignedspace[11],*tmps,*tmpx;
3586   unsigned long   offset;
3587 
3588   PetscFunctionBegin;
3589   SSE_SCOPE_BEGIN;
3590 
3591     offset = (unsigned long)ssealignedspace % 16;
3592     if (offset) offset = (16 - offset)/4;
3593     tmps = &ssealignedspace[offset];
3594     tmpx = &ssealignedspace[offset+4];
3595     PREFETCH_NTA(aa+16*ai[1]);
3596 
3597     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3598     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3599     t  = a->solve_work;
3600 
3601     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3602     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3603 
3604     /* forward solve the lower triangular */
3605     idx  = 4*(*r++);
3606     t[0] = b[idx];   t[1] = b[1+idx];
3607     t[2] = b[2+idx]; t[3] = b[3+idx];
3608     v    =  aa + 16*ai[1];
3609 
3610     for (i=1; i<n;) {
3611       PREFETCH_NTA(&v[8]);
3612       vi   =  aj      + ai[i];
3613       nz   =  diag[i] - ai[i];
3614       idx  =  4*(*r++);
3615 
3616       /* Demote sum from double to float */
3617       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3618       LOAD_PS(tmps,XMM7);
3619 
3620       while (nz--) {
3621         PREFETCH_NTA(&v[16]);
3622         idx = 4*(*vi++);
3623 
3624         /* Demote solution (so far) from double to float */
3625         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3626 
3627         /* 4x4 Matrix-Vector product with negative accumulation: */
3628         SSE_INLINE_BEGIN_2(tmpx,v)
3629           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3630 
3631           /* First Column */
3632           SSE_COPY_PS(XMM0,XMM6)
3633           SSE_SHUFFLE(XMM0,XMM0,0x00)
3634           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3635           SSE_SUB_PS(XMM7,XMM0)
3636 
3637           /* Second Column */
3638           SSE_COPY_PS(XMM1,XMM6)
3639           SSE_SHUFFLE(XMM1,XMM1,0x55)
3640           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3641           SSE_SUB_PS(XMM7,XMM1)
3642 
3643           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3644 
3645           /* Third Column */
3646           SSE_COPY_PS(XMM2,XMM6)
3647           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3648           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3649           SSE_SUB_PS(XMM7,XMM2)
3650 
3651           /* Fourth Column */
3652           SSE_COPY_PS(XMM3,XMM6)
3653           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3654           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3655           SSE_SUB_PS(XMM7,XMM3)
3656         SSE_INLINE_END_2
3657 
3658         v  += 16;
3659       }
3660       idx = 4*i;
3661       v   = aa + 16*ai[++i];
3662       PREFETCH_NTA(v);
3663       STORE_PS(tmps,XMM7);
3664 
3665       /* Promote result from float to double */
3666       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3667     }
3668     /* backward solve the upper triangular */
3669     idt  = 4*(n-1);
3670     ai16 = 16*diag[n-1];
3671     v    = aa + ai16 + 16;
3672     for (i=n-1; i>=0;){
3673       PREFETCH_NTA(&v[8]);
3674       vi = aj + diag[i] + 1;
3675       nz = ai[i+1] - diag[i] - 1;
3676 
3677       /* Demote accumulator from double to float */
3678       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3679       LOAD_PS(tmps,XMM7);
3680 
3681       while (nz--) {
3682         PREFETCH_NTA(&v[16]);
3683         idx = 4*(*vi++);
3684 
3685         /* Demote solution (so far) from double to float */
3686         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3687 
3688         /* 4x4 Matrix-Vector Product with negative accumulation: */
3689         SSE_INLINE_BEGIN_2(tmpx,v)
3690           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3691 
3692           /* First Column */
3693           SSE_COPY_PS(XMM0,XMM6)
3694           SSE_SHUFFLE(XMM0,XMM0,0x00)
3695           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3696           SSE_SUB_PS(XMM7,XMM0)
3697 
3698           /* Second Column */
3699           SSE_COPY_PS(XMM1,XMM6)
3700           SSE_SHUFFLE(XMM1,XMM1,0x55)
3701           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3702           SSE_SUB_PS(XMM7,XMM1)
3703 
3704           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3705 
3706           /* Third Column */
3707           SSE_COPY_PS(XMM2,XMM6)
3708           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3709           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3710           SSE_SUB_PS(XMM7,XMM2)
3711 
3712           /* Fourth Column */
3713           SSE_COPY_PS(XMM3,XMM6)
3714           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3715           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3716           SSE_SUB_PS(XMM7,XMM3)
3717         SSE_INLINE_END_2
3718         v  += 16;
3719       }
3720       v    = aa + ai16;
3721       ai16 = 16*diag[--i];
3722       PREFETCH_NTA(aa+ai16+16);
3723       /*
3724          Scale the result by the diagonal 4x4 block,
3725          which was inverted as part of the factorization
3726       */
3727       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3728         /* First Column */
3729         SSE_COPY_PS(XMM0,XMM7)
3730         SSE_SHUFFLE(XMM0,XMM0,0x00)
3731         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3732 
3733         /* Second Column */
3734         SSE_COPY_PS(XMM1,XMM7)
3735         SSE_SHUFFLE(XMM1,XMM1,0x55)
3736         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3737         SSE_ADD_PS(XMM0,XMM1)
3738 
3739         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3740 
3741         /* Third Column */
3742         SSE_COPY_PS(XMM2,XMM7)
3743         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3744         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3745         SSE_ADD_PS(XMM0,XMM2)
3746 
3747         /* Fourth Column */
3748         SSE_COPY_PS(XMM3,XMM7)
3749         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3750         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3751         SSE_ADD_PS(XMM0,XMM3)
3752 
3753         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3754       SSE_INLINE_END_3
3755 
3756       /* Promote solution from float to double */
3757       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3758 
3759       /* Apply reordering to t and stream into x.    */
3760       /* This way, x doesn't pollute the cache.      */
3761       /* Be careful with size: 2 doubles = 4 floats! */
3762       idc  = 4*(*c--);
3763       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3764         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3765         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3766         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3767         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3768         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3769         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3770       SSE_INLINE_END_2
3771       v    = aa + ai16 + 16;
3772       idt -= 4;
3773     }
3774 
3775     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3776     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3777     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3778     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3779     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3780   SSE_SCOPE_END;
3781   PetscFunctionReturn(0);
3782 }
3783 
3784 #endif
3785 
3786 
3787 /*
3788       Special case where the matrix was ILU(0) factored in the natural
3789    ordering. This eliminates the need for the column and row permutation.
3790 */
3791 #undef __FUNCT__
3792 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3793 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3794 {
3795   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3796   PetscInt          n=a->mbs;
3797   const PetscInt    *ai=a->i,*aj=a->j;
3798   PetscErrorCode    ierr;
3799   const PetscInt    *diag = a->diag;
3800   const MatScalar   *aa=a->a;
3801   PetscScalar       *x;
3802   const PetscScalar *b;
3803 
3804   PetscFunctionBegin;
3805   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3806   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3807 
3808 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3809   {
3810     static PetscScalar w[2000]; /* very BAD need to fix */
3811     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3812   }
3813 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3814   {
3815     static PetscScalar w[2000]; /* very BAD need to fix */
3816     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3817   }
3818 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3819   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3820 #else
3821   {
3822     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3823     const MatScalar *v;
3824     PetscInt        jdx,idt,idx,nz,i,ai16;
3825     const PetscInt  *vi;
3826 
3827   /* forward solve the lower triangular */
3828   idx    = 0;
3829   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3830   for (i=1; i<n; i++) {
3831     v     =  aa      + 16*ai[i];
3832     vi    =  aj      + ai[i];
3833     nz    =  diag[i] - ai[i];
3834     idx   +=  4;
3835     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3836     while (nz--) {
3837       jdx   = 4*(*vi++);
3838       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3839       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3840       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3841       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3842       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3843       v    += 16;
3844     }
3845     x[idx]   = s1;
3846     x[1+idx] = s2;
3847     x[2+idx] = s3;
3848     x[3+idx] = s4;
3849   }
3850   /* backward solve the upper triangular */
3851   idt = 4*(n-1);
3852   for (i=n-1; i>=0; i--){
3853     ai16 = 16*diag[i];
3854     v    = aa + ai16 + 16;
3855     vi   = aj + diag[i] + 1;
3856     nz   = ai[i+1] - diag[i] - 1;
3857     s1 = x[idt];  s2 = x[1+idt];
3858     s3 = x[2+idt];s4 = x[3+idt];
3859     while (nz--) {
3860       idx   = 4*(*vi++);
3861       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3862       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3863       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3864       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3865       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3866       v    += 16;
3867     }
3868     v        = aa + ai16;
3869     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3870     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3871     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3872     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3873     idt -= 4;
3874   }
3875   }
3876 #endif
3877 
3878   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3879   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3880   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3881   PetscFunctionReturn(0);
3882 }
3883 
3884 #undef __FUNCT__
3885 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3886 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3887 {
3888     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3889     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3890     PetscErrorCode    ierr;
3891     PetscInt          idx,jdx,idt;
3892     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3893     const MatScalar   *aa=a->a,*v;
3894     PetscScalar       *x;
3895     const PetscScalar *b;
3896     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3897 
3898     PetscFunctionBegin;
3899     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3900     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3901     /* forward solve the lower triangular */
3902     idx    = 0;
3903     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3904     for (i=1; i<n; i++) {
3905        v    = aa + bs2*ai[i];
3906        vi   = aj + ai[i];
3907        nz   = ai[i+1] - ai[i];
3908       idx   = bs*i;
3909        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3910       for(k=0;k<nz;k++) {
3911           jdx   = bs*vi[k];
3912           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3913           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3914           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3915           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3916 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3917 
3918           v   +=  bs2;
3919         }
3920 
3921        x[idx]   = s1;
3922        x[1+idx] = s2;
3923        x[2+idx] = s3;
3924        x[3+idx] = s4;
3925     }
3926 
3927    /* backward solve the upper triangular */
3928   for (i=n-1; i>=0; i--){
3929     v   = aa + bs2*(adiag[i+1]+1);
3930      vi  = aj + adiag[i+1]+1;
3931      nz  = adiag[i] - adiag[i+1]-1;
3932      idt = bs*i;
3933      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3934 
3935     for(k=0;k<nz;k++){
3936       idx   = bs*vi[k];
3937        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3938        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3939        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3940        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3941        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3942 
3943         v   +=  bs2;
3944     }
3945     /* x = inv_diagonal*x */
3946    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3947    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3948    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3949    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3950 
3951   }
3952 
3953   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3954   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3955   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3956   PetscFunctionReturn(0);
3957 }
3958 
3959 #undef __FUNCT__
3960 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3961 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3962 {
3963   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3964   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3965   PetscErrorCode ierr;
3966   PetscInt       *diag = a->diag;
3967   MatScalar      *aa=a->a;
3968   PetscScalar    *x,*b;
3969 
3970   PetscFunctionBegin;
3971   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3972   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3973 
3974   {
3975     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3976     MatScalar  *v,*t=(MatScalar *)x;
3977     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3978 
3979     /* forward solve the lower triangular */
3980     idx  = 0;
3981     t[0] = (MatScalar)b[0];
3982     t[1] = (MatScalar)b[1];
3983     t[2] = (MatScalar)b[2];
3984     t[3] = (MatScalar)b[3];
3985     for (i=1; i<n; i++) {
3986       v     =  aa      + 16*ai[i];
3987       vi    =  aj      + ai[i];
3988       nz    =  diag[i] - ai[i];
3989       idx   +=  4;
3990       s1 = (MatScalar)b[idx];
3991       s2 = (MatScalar)b[1+idx];
3992       s3 = (MatScalar)b[2+idx];
3993       s4 = (MatScalar)b[3+idx];
3994       while (nz--) {
3995         jdx = 4*(*vi++);
3996         x1  = t[jdx];
3997         x2  = t[1+jdx];
3998         x3  = t[2+jdx];
3999         x4  = t[3+jdx];
4000         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4001         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4002         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4003         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4004         v    += 16;
4005       }
4006       t[idx]   = s1;
4007       t[1+idx] = s2;
4008       t[2+idx] = s3;
4009       t[3+idx] = s4;
4010     }
4011     /* backward solve the upper triangular */
4012     idt = 4*(n-1);
4013     for (i=n-1; i>=0; i--){
4014       ai16 = 16*diag[i];
4015       v    = aa + ai16 + 16;
4016       vi   = aj + diag[i] + 1;
4017       nz   = ai[i+1] - diag[i] - 1;
4018       s1   = t[idt];
4019       s2   = t[1+idt];
4020       s3   = t[2+idt];
4021       s4   = t[3+idt];
4022       while (nz--) {
4023         idx = 4*(*vi++);
4024         x1  = (MatScalar)x[idx];
4025         x2  = (MatScalar)x[1+idx];
4026         x3  = (MatScalar)x[2+idx];
4027         x4  = (MatScalar)x[3+idx];
4028         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4029         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4030         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4031         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4032         v    += 16;
4033       }
4034       v        = aa + ai16;
4035       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4036       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4037       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4038       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4039       idt -= 4;
4040     }
4041   }
4042 
4043   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4044   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4045   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4046   PetscFunctionReturn(0);
4047 }
4048 
4049 #if defined (PETSC_HAVE_SSE)
4050 
4051 #include PETSC_HAVE_SSE
4052 #undef __FUNCT__
4053 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4054 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4055 {
4056   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4057   unsigned short *aj=(unsigned short *)a->j;
4058   PetscErrorCode ierr;
4059   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4060   MatScalar      *aa=a->a;
4061   PetscScalar    *x,*b;
4062 
4063   PetscFunctionBegin;
4064   SSE_SCOPE_BEGIN;
4065   /*
4066      Note: This code currently uses demotion of double
4067      to float when performing the mixed-mode computation.
4068      This may not be numerically reasonable for all applications.
4069   */
4070   PREFETCH_NTA(aa+16*ai[1]);
4071 
4072   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4073   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4074   {
4075     /* x will first be computed in single precision then promoted inplace to double */
4076     MatScalar      *v,*t=(MatScalar *)x;
4077     int            nz,i,idt,ai16;
4078     unsigned int   jdx,idx;
4079     unsigned short *vi;
4080     /* Forward solve the lower triangular factor. */
4081 
4082     /* First block is the identity. */
4083     idx  = 0;
4084     CONVERT_DOUBLE4_FLOAT4(t,b);
4085     v    =  aa + 16*((unsigned int)ai[1]);
4086 
4087     for (i=1; i<n;) {
4088       PREFETCH_NTA(&v[8]);
4089       vi   =  aj      + ai[i];
4090       nz   =  diag[i] - ai[i];
4091       idx +=  4;
4092 
4093       /* Demote RHS from double to float. */
4094       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4095       LOAD_PS(&t[idx],XMM7);
4096 
4097       while (nz--) {
4098         PREFETCH_NTA(&v[16]);
4099         jdx = 4*((unsigned int)(*vi++));
4100 
4101         /* 4x4 Matrix-Vector product with negative accumulation: */
4102         SSE_INLINE_BEGIN_2(&t[jdx],v)
4103           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4104 
4105           /* First Column */
4106           SSE_COPY_PS(XMM0,XMM6)
4107           SSE_SHUFFLE(XMM0,XMM0,0x00)
4108           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4109           SSE_SUB_PS(XMM7,XMM0)
4110 
4111           /* Second Column */
4112           SSE_COPY_PS(XMM1,XMM6)
4113           SSE_SHUFFLE(XMM1,XMM1,0x55)
4114           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4115           SSE_SUB_PS(XMM7,XMM1)
4116 
4117           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4118 
4119           /* Third Column */
4120           SSE_COPY_PS(XMM2,XMM6)
4121           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4122           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4123           SSE_SUB_PS(XMM7,XMM2)
4124 
4125           /* Fourth Column */
4126           SSE_COPY_PS(XMM3,XMM6)
4127           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4128           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4129           SSE_SUB_PS(XMM7,XMM3)
4130         SSE_INLINE_END_2
4131 
4132         v  += 16;
4133       }
4134       v    =  aa + 16*ai[++i];
4135       PREFETCH_NTA(v);
4136       STORE_PS(&t[idx],XMM7);
4137     }
4138 
4139     /* Backward solve the upper triangular factor.*/
4140 
4141     idt  = 4*(n-1);
4142     ai16 = 16*diag[n-1];
4143     v    = aa + ai16 + 16;
4144     for (i=n-1; i>=0;){
4145       PREFETCH_NTA(&v[8]);
4146       vi = aj + diag[i] + 1;
4147       nz = ai[i+1] - diag[i] - 1;
4148 
4149       LOAD_PS(&t[idt],XMM7);
4150 
4151       while (nz--) {
4152         PREFETCH_NTA(&v[16]);
4153         idx = 4*((unsigned int)(*vi++));
4154 
4155         /* 4x4 Matrix-Vector Product with negative accumulation: */
4156         SSE_INLINE_BEGIN_2(&t[idx],v)
4157           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4158 
4159           /* First Column */
4160           SSE_COPY_PS(XMM0,XMM6)
4161           SSE_SHUFFLE(XMM0,XMM0,0x00)
4162           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4163           SSE_SUB_PS(XMM7,XMM0)
4164 
4165           /* Second Column */
4166           SSE_COPY_PS(XMM1,XMM6)
4167           SSE_SHUFFLE(XMM1,XMM1,0x55)
4168           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4169           SSE_SUB_PS(XMM7,XMM1)
4170 
4171           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4172 
4173           /* Third Column */
4174           SSE_COPY_PS(XMM2,XMM6)
4175           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4176           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4177           SSE_SUB_PS(XMM7,XMM2)
4178 
4179           /* Fourth Column */
4180           SSE_COPY_PS(XMM3,XMM6)
4181           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4182           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4183           SSE_SUB_PS(XMM7,XMM3)
4184         SSE_INLINE_END_2
4185         v  += 16;
4186       }
4187       v    = aa + ai16;
4188       ai16 = 16*diag[--i];
4189       PREFETCH_NTA(aa+ai16+16);
4190       /*
4191          Scale the result by the diagonal 4x4 block,
4192          which was inverted as part of the factorization
4193       */
4194       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4195         /* First Column */
4196         SSE_COPY_PS(XMM0,XMM7)
4197         SSE_SHUFFLE(XMM0,XMM0,0x00)
4198         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4199 
4200         /* Second Column */
4201         SSE_COPY_PS(XMM1,XMM7)
4202         SSE_SHUFFLE(XMM1,XMM1,0x55)
4203         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4204         SSE_ADD_PS(XMM0,XMM1)
4205 
4206         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4207 
4208         /* Third Column */
4209         SSE_COPY_PS(XMM2,XMM7)
4210         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4211         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4212         SSE_ADD_PS(XMM0,XMM2)
4213 
4214         /* Fourth Column */
4215         SSE_COPY_PS(XMM3,XMM7)
4216         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4217         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4218         SSE_ADD_PS(XMM0,XMM3)
4219 
4220         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4221       SSE_INLINE_END_3
4222 
4223       v    = aa + ai16 + 16;
4224       idt -= 4;
4225     }
4226 
4227     /* Convert t from single precision back to double precision (inplace)*/
4228     idt = 4*(n-1);
4229     for (i=n-1;i>=0;i--) {
4230       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4231       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4232       PetscScalar *xtemp=&x[idt];
4233       MatScalar   *ttemp=&t[idt];
4234       xtemp[3] = (PetscScalar)ttemp[3];
4235       xtemp[2] = (PetscScalar)ttemp[2];
4236       xtemp[1] = (PetscScalar)ttemp[1];
4237       xtemp[0] = (PetscScalar)ttemp[0];
4238       idt -= 4;
4239     }
4240 
4241   } /* End of artificial scope. */
4242   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4243   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4244   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4245   SSE_SCOPE_END;
4246   PetscFunctionReturn(0);
4247 }
4248 
4249 #undef __FUNCT__
4250 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4251 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4252 {
4253   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4254   int            *aj=a->j;
4255   PetscErrorCode ierr;
4256   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4257   MatScalar      *aa=a->a;
4258   PetscScalar    *x,*b;
4259 
4260   PetscFunctionBegin;
4261   SSE_SCOPE_BEGIN;
4262   /*
4263      Note: This code currently uses demotion of double
4264      to float when performing the mixed-mode computation.
4265      This may not be numerically reasonable for all applications.
4266   */
4267   PREFETCH_NTA(aa+16*ai[1]);
4268 
4269   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4270   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4271   {
4272     /* x will first be computed in single precision then promoted inplace to double */
4273     MatScalar *v,*t=(MatScalar *)x;
4274     int       nz,i,idt,ai16;
4275     int       jdx,idx;
4276     int       *vi;
4277     /* Forward solve the lower triangular factor. */
4278 
4279     /* First block is the identity. */
4280     idx  = 0;
4281     CONVERT_DOUBLE4_FLOAT4(t,b);
4282     v    =  aa + 16*ai[1];
4283 
4284     for (i=1; i<n;) {
4285       PREFETCH_NTA(&v[8]);
4286       vi   =  aj      + ai[i];
4287       nz   =  diag[i] - ai[i];
4288       idx +=  4;
4289 
4290       /* Demote RHS from double to float. */
4291       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4292       LOAD_PS(&t[idx],XMM7);
4293 
4294       while (nz--) {
4295         PREFETCH_NTA(&v[16]);
4296         jdx = 4*(*vi++);
4297 /*          jdx = *vi++; */
4298 
4299         /* 4x4 Matrix-Vector product with negative accumulation: */
4300         SSE_INLINE_BEGIN_2(&t[jdx],v)
4301           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4302 
4303           /* First Column */
4304           SSE_COPY_PS(XMM0,XMM6)
4305           SSE_SHUFFLE(XMM0,XMM0,0x00)
4306           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4307           SSE_SUB_PS(XMM7,XMM0)
4308 
4309           /* Second Column */
4310           SSE_COPY_PS(XMM1,XMM6)
4311           SSE_SHUFFLE(XMM1,XMM1,0x55)
4312           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4313           SSE_SUB_PS(XMM7,XMM1)
4314 
4315           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4316 
4317           /* Third Column */
4318           SSE_COPY_PS(XMM2,XMM6)
4319           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4320           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4321           SSE_SUB_PS(XMM7,XMM2)
4322 
4323           /* Fourth Column */
4324           SSE_COPY_PS(XMM3,XMM6)
4325           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4326           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4327           SSE_SUB_PS(XMM7,XMM3)
4328         SSE_INLINE_END_2
4329 
4330         v  += 16;
4331       }
4332       v    =  aa + 16*ai[++i];
4333       PREFETCH_NTA(v);
4334       STORE_PS(&t[idx],XMM7);
4335     }
4336 
4337     /* Backward solve the upper triangular factor.*/
4338 
4339     idt  = 4*(n-1);
4340     ai16 = 16*diag[n-1];
4341     v    = aa + ai16 + 16;
4342     for (i=n-1; i>=0;){
4343       PREFETCH_NTA(&v[8]);
4344       vi = aj + diag[i] + 1;
4345       nz = ai[i+1] - diag[i] - 1;
4346 
4347       LOAD_PS(&t[idt],XMM7);
4348 
4349       while (nz--) {
4350         PREFETCH_NTA(&v[16]);
4351         idx = 4*(*vi++);
4352 /*          idx = *vi++; */
4353 
4354         /* 4x4 Matrix-Vector Product with negative accumulation: */
4355         SSE_INLINE_BEGIN_2(&t[idx],v)
4356           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4357 
4358           /* First Column */
4359           SSE_COPY_PS(XMM0,XMM6)
4360           SSE_SHUFFLE(XMM0,XMM0,0x00)
4361           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4362           SSE_SUB_PS(XMM7,XMM0)
4363 
4364           /* Second Column */
4365           SSE_COPY_PS(XMM1,XMM6)
4366           SSE_SHUFFLE(XMM1,XMM1,0x55)
4367           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4368           SSE_SUB_PS(XMM7,XMM1)
4369 
4370           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4371 
4372           /* Third Column */
4373           SSE_COPY_PS(XMM2,XMM6)
4374           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4375           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4376           SSE_SUB_PS(XMM7,XMM2)
4377 
4378           /* Fourth Column */
4379           SSE_COPY_PS(XMM3,XMM6)
4380           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4381           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4382           SSE_SUB_PS(XMM7,XMM3)
4383         SSE_INLINE_END_2
4384         v  += 16;
4385       }
4386       v    = aa + ai16;
4387       ai16 = 16*diag[--i];
4388       PREFETCH_NTA(aa+ai16+16);
4389       /*
4390          Scale the result by the diagonal 4x4 block,
4391          which was inverted as part of the factorization
4392       */
4393       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4394         /* First Column */
4395         SSE_COPY_PS(XMM0,XMM7)
4396         SSE_SHUFFLE(XMM0,XMM0,0x00)
4397         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4398 
4399         /* Second Column */
4400         SSE_COPY_PS(XMM1,XMM7)
4401         SSE_SHUFFLE(XMM1,XMM1,0x55)
4402         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4403         SSE_ADD_PS(XMM0,XMM1)
4404 
4405         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4406 
4407         /* Third Column */
4408         SSE_COPY_PS(XMM2,XMM7)
4409         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4410         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4411         SSE_ADD_PS(XMM0,XMM2)
4412 
4413         /* Fourth Column */
4414         SSE_COPY_PS(XMM3,XMM7)
4415         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4416         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4417         SSE_ADD_PS(XMM0,XMM3)
4418 
4419         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4420       SSE_INLINE_END_3
4421 
4422       v    = aa + ai16 + 16;
4423       idt -= 4;
4424     }
4425 
4426     /* Convert t from single precision back to double precision (inplace)*/
4427     idt = 4*(n-1);
4428     for (i=n-1;i>=0;i--) {
4429       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4430       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4431       PetscScalar *xtemp=&x[idt];
4432       MatScalar   *ttemp=&t[idt];
4433       xtemp[3] = (PetscScalar)ttemp[3];
4434       xtemp[2] = (PetscScalar)ttemp[2];
4435       xtemp[1] = (PetscScalar)ttemp[1];
4436       xtemp[0] = (PetscScalar)ttemp[0];
4437       idt -= 4;
4438     }
4439 
4440   } /* End of artificial scope. */
4441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4443   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4444   SSE_SCOPE_END;
4445   PetscFunctionReturn(0);
4446 }
4447 
4448 #endif
4449 
4450 #undef __FUNCT__
4451 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4452 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4453 {
4454   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4455   IS                iscol=a->col,isrow=a->row;
4456   PetscErrorCode    ierr;
4457   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4458   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4459   const MatScalar   *aa=a->a,*v;
4460   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4461   const PetscScalar *b;
4462 
4463   PetscFunctionBegin;
4464   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4465   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4466   t  = a->solve_work;
4467 
4468   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4469   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4470 
4471   /* forward solve the lower triangular */
4472   idx    = 3*(*r++);
4473   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4474   for (i=1; i<n; i++) {
4475     v     = aa + 9*ai[i];
4476     vi    = aj + ai[i];
4477     nz    = diag[i] - ai[i];
4478     idx   = 3*(*r++);
4479     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4480     while (nz--) {
4481       idx   = 3*(*vi++);
4482       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4483       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4484       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4485       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4486       v += 9;
4487     }
4488     idx = 3*i;
4489     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4490   }
4491   /* backward solve the upper triangular */
4492   for (i=n-1; i>=0; i--){
4493     v    = aa + 9*diag[i] + 9;
4494     vi   = aj + diag[i] + 1;
4495     nz   = ai[i+1] - diag[i] - 1;
4496     idt  = 3*i;
4497     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4498     while (nz--) {
4499       idx   = 3*(*vi++);
4500       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4501       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4502       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4503       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4504       v += 9;
4505     }
4506     idc = 3*(*c--);
4507     v   = aa + 9*diag[i];
4508     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4509     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4510     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4511   }
4512   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4513   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4514   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4515   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4516   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4517   PetscFunctionReturn(0);
4518 }
4519 
4520 #undef __FUNCT__
4521 #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4522 PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
4523 {
4524   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4525   IS                iscol=a->col,isrow=a->row;
4526   PetscErrorCode    ierr;
4527   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4528   const PetscInt    *r,*c,*rout,*cout;
4529   const MatScalar   *aa=a->a,*v;
4530   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4531   const PetscScalar *b;
4532 
4533   PetscFunctionBegin;
4534   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4536   t  = a->solve_work;
4537 
4538   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4539   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4540 
4541   /* forward solve the lower triangular */
4542   idx    = 3*r[0];
4543   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4544   for (i=1; i<n; i++) {
4545     v     = aa + 9*ai[i];
4546     vi    = aj + ai[i];
4547     nz    = ai[i+1] - ai[i];
4548     idx   = 3*r[i];
4549     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4550     for(m=0;m<nz;m++){
4551       idx   = 3*vi[m];
4552       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4553       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4554       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4555       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4556       v += 9;
4557     }
4558     idx = 3*i;
4559     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4560   }
4561   /* backward solve the upper triangular */
4562   for (i=n-1; i>=0; i--){
4563     v    = aa + 9*(adiag[i+1]+1);
4564     vi   = aj + adiag[i+1]+1;
4565     nz   = adiag[i] - adiag[i+1] - 1;
4566     idt  = 3*i;
4567     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4568     for(m=0;m<nz;m++){
4569       idx   = 3*vi[m];
4570       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4571       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4572       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4573       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4574       v += 9;
4575     }
4576     idc = 3*c[i];
4577     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4578     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4579     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4580   }
4581   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4582   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4583   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4584   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4585   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4586   PetscFunctionReturn(0);
4587 }
4588 
4589 /*
4590       Special case where the matrix was ILU(0) factored in the natural
4591    ordering. This eliminates the need for the column and row permutation.
4592 */
4593 #undef __FUNCT__
4594 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4595 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4596 {
4597   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4598   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4599   PetscErrorCode    ierr;
4600   PetscInt          *diag = a->diag;
4601   const MatScalar   *aa=a->a,*v;
4602   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4603   const PetscScalar *b;
4604   PetscInt          jdx,idt,idx,nz,*vi,i;
4605 
4606   PetscFunctionBegin;
4607   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4608   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4609 
4610   /* forward solve the lower triangular */
4611   idx    = 0;
4612   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4613   for (i=1; i<n; i++) {
4614     v     =  aa      + 9*ai[i];
4615     vi    =  aj      + ai[i];
4616     nz    =  diag[i] - ai[i];
4617     idx   +=  3;
4618     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4619     while (nz--) {
4620       jdx   = 3*(*vi++);
4621       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4622       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4623       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4624       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4625       v    += 9;
4626     }
4627     x[idx]   = s1;
4628     x[1+idx] = s2;
4629     x[2+idx] = s3;
4630   }
4631   /* backward solve the upper triangular */
4632   for (i=n-1; i>=0; i--){
4633     v    = aa + 9*diag[i] + 9;
4634     vi   = aj + diag[i] + 1;
4635     nz   = ai[i+1] - diag[i] - 1;
4636     idt  = 3*i;
4637     s1 = x[idt];  s2 = x[1+idt];
4638     s3 = x[2+idt];
4639     while (nz--) {
4640       idx   = 3*(*vi++);
4641       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4642       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4643       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4644       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4645       v    += 9;
4646     }
4647     v        = aa +  9*diag[i];
4648     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4649     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4650     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4651   }
4652 
4653   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4654   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4655   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4656   PetscFunctionReturn(0);
4657 }
4658 
4659 #undef __FUNCT__
4660 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4661 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4662 {
4663     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4664     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4665     PetscErrorCode    ierr;
4666     PetscInt          idx,jdx,idt;
4667     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4668     const MatScalar   *aa=a->a,*v;
4669     PetscScalar       *x;
4670     const PetscScalar *b;
4671     PetscScalar        s1,s2,s3,x1,x2,x3;
4672 
4673     PetscFunctionBegin;
4674     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4675     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4676     /* forward solve the lower triangular */
4677     idx    = 0;
4678     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4679     for (i=1; i<n; i++) {
4680        v    = aa + bs2*ai[i];
4681        vi   = aj + ai[i];
4682        nz   = ai[i+1] - ai[i];
4683       idx   = bs*i;
4684        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4685       for(k=0;k<nz;k++){
4686          jdx   = bs*vi[k];
4687           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4688           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4689           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4690           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4691 
4692           v   +=  bs2;
4693         }
4694 
4695        x[idx]   = s1;
4696        x[1+idx] = s2;
4697        x[2+idx] = s3;
4698     }
4699 
4700    /* backward solve the upper triangular */
4701   for (i=n-1; i>=0; i--){
4702     v   = aa + bs2*(adiag[i+1]+1);
4703      vi  = aj + adiag[i+1]+1;
4704      nz  = adiag[i] - adiag[i+1]-1;
4705      idt = bs*i;
4706      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4707 
4708      for(k=0;k<nz;k++){
4709        idx   = bs*vi[k];
4710        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4711        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4712        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4713        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4714 
4715         v   +=  bs2;
4716     }
4717     /* x = inv_diagonal*x */
4718    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4719    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4720    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4721 
4722   }
4723 
4724   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4725   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4726   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4727   PetscFunctionReturn(0);
4728 }
4729 
4730 #undef __FUNCT__
4731 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4732 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4733 {
4734   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4735   IS                iscol=a->col,isrow=a->row;
4736   PetscErrorCode    ierr;
4737   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4738   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4739   const MatScalar   *aa=a->a,*v;
4740   PetscScalar       *x,s1,s2,x1,x2,*t;
4741   const PetscScalar *b;
4742 
4743   PetscFunctionBegin;
4744   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4745   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4746   t  = a->solve_work;
4747 
4748   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4749   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4750 
4751   /* forward solve the lower triangular */
4752   idx    = 2*(*r++);
4753   t[0] = b[idx]; t[1] = b[1+idx];
4754   for (i=1; i<n; i++) {
4755     v     = aa + 4*ai[i];
4756     vi    = aj + ai[i];
4757     nz    = diag[i] - ai[i];
4758     idx   = 2*(*r++);
4759     s1  = b[idx]; s2 = b[1+idx];
4760     while (nz--) {
4761       idx   = 2*(*vi++);
4762       x1    = t[idx]; x2 = t[1+idx];
4763       s1 -= v[0]*x1 + v[2]*x2;
4764       s2 -= v[1]*x1 + v[3]*x2;
4765       v += 4;
4766     }
4767     idx = 2*i;
4768     t[idx] = s1; t[1+idx] = s2;
4769   }
4770   /* backward solve the upper triangular */
4771   for (i=n-1; i>=0; i--){
4772     v    = aa + 4*diag[i] + 4;
4773     vi   = aj + diag[i] + 1;
4774     nz   = ai[i+1] - diag[i] - 1;
4775     idt  = 2*i;
4776     s1 = t[idt]; s2 = t[1+idt];
4777     while (nz--) {
4778       idx   = 2*(*vi++);
4779       x1    = t[idx]; x2 = t[1+idx];
4780       s1 -= v[0]*x1 + v[2]*x2;
4781       s2 -= v[1]*x1 + v[3]*x2;
4782       v += 4;
4783     }
4784     idc = 2*(*c--);
4785     v   = aa + 4*diag[i];
4786     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4787     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4788   }
4789   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4790   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4791   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4792   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4793   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4794   PetscFunctionReturn(0);
4795 }
4796 
4797 #undef __FUNCT__
4798 #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4799 PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
4800 {
4801   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4802   IS                iscol=a->col,isrow=a->row;
4803   PetscErrorCode    ierr;
4804   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4805   const PetscInt    *r,*c,*rout,*cout;
4806   const MatScalar   *aa=a->a,*v;
4807   PetscScalar       *x,s1,s2,x1,x2,*t;
4808   const PetscScalar *b;
4809 
4810   PetscFunctionBegin;
4811   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4812   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4813   t  = a->solve_work;
4814 
4815   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4816   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4817 
4818   /* forward solve the lower triangular */
4819   idx    = 2*r[0];
4820   t[0] = b[idx]; t[1] = b[1+idx];
4821   for (i=1; i<n; i++) {
4822     v     = aa + 4*ai[i];
4823     vi    = aj + ai[i];
4824     nz    = ai[i+1] - ai[i];
4825     idx   = 2*r[i];
4826     s1  = b[idx]; s2 = b[1+idx];
4827     for(m=0;m<nz;m++){
4828       jdx   = 2*vi[m];
4829       x1    = t[jdx]; x2 = t[1+jdx];
4830       s1 -= v[0]*x1 + v[2]*x2;
4831       s2 -= v[1]*x1 + v[3]*x2;
4832       v += 4;
4833     }
4834     idx = 2*i;
4835     t[idx] = s1; t[1+idx] = s2;
4836   }
4837   /* backward solve the upper triangular */
4838   for (i=n-1; i>=0; i--){
4839     v    = aa + 4*(adiag[i+1]+1);
4840     vi   = aj + adiag[i+1]+1;
4841     nz   = adiag[i] - adiag[i+1] - 1;
4842     idt  = 2*i;
4843     s1 = t[idt]; s2 = t[1+idt];
4844     for(m=0;m<nz;m++){
4845       idx   = 2*vi[m];
4846       x1    = t[idx]; x2 = t[1+idx];
4847       s1 -= v[0]*x1 + v[2]*x2;
4848       s2 -= v[1]*x1 + v[3]*x2;
4849       v += 4;
4850     }
4851     idc = 2*c[i];
4852     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4853     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4854   }
4855   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4856   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4857   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4858   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4859   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4860   PetscFunctionReturn(0);
4861 }
4862 
4863 /*
4864       Special case where the matrix was ILU(0) factored in the natural
4865    ordering. This eliminates the need for the column and row permutation.
4866 */
4867 #undef __FUNCT__
4868 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4869 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
4870 {
4871   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4872   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4873   PetscErrorCode    ierr;
4874   PetscInt          *diag = a->diag;
4875   const MatScalar   *aa=a->a,*v;
4876   PetscScalar       *x,s1,s2,x1,x2;
4877   const PetscScalar *b;
4878   PetscInt          jdx,idt,idx,nz,*vi,i;
4879 
4880   PetscFunctionBegin;
4881   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4882   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4883 
4884   /* forward solve the lower triangular */
4885   idx    = 0;
4886   x[0]   = b[0]; x[1] = b[1];
4887   for (i=1; i<n; i++) {
4888     v     =  aa      + 4*ai[i];
4889     vi    =  aj      + ai[i];
4890     nz    =  diag[i] - ai[i];
4891     idx   +=  2;
4892     s1  =  b[idx];s2 = b[1+idx];
4893     while (nz--) {
4894       jdx   = 2*(*vi++);
4895       x1    = x[jdx];x2 = x[1+jdx];
4896       s1 -= v[0]*x1 + v[2]*x2;
4897       s2 -= v[1]*x1 + v[3]*x2;
4898       v    += 4;
4899     }
4900     x[idx]   = s1;
4901     x[1+idx] = s2;
4902   }
4903   /* backward solve the upper triangular */
4904   for (i=n-1; i>=0; i--){
4905     v    = aa + 4*diag[i] + 4;
4906     vi   = aj + diag[i] + 1;
4907     nz   = ai[i+1] - diag[i] - 1;
4908     idt  = 2*i;
4909     s1 = x[idt];  s2 = x[1+idt];
4910     while (nz--) {
4911       idx   = 2*(*vi++);
4912       x1    = x[idx];   x2 = x[1+idx];
4913       s1 -= v[0]*x1 + v[2]*x2;
4914       s2 -= v[1]*x1 + v[3]*x2;
4915       v    += 4;
4916     }
4917     v        = aa +  4*diag[i];
4918     x[idt]   = v[0]*s1 + v[2]*s2;
4919     x[1+idt] = v[1]*s1 + v[3]*s2;
4920   }
4921 
4922   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4923   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4924   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4925   PetscFunctionReturn(0);
4926 }
4927 
4928 #undef __FUNCT__
4929 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4930 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4931 {
4932     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4933     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4934     PetscErrorCode    ierr;
4935     PetscInt          jdx;
4936     const MatScalar   *aa=a->a,*v;
4937     PetscScalar       *x,s1,s2,x1,x2;
4938     const PetscScalar *b;
4939 
4940     PetscFunctionBegin;
4941     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4942     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4943     /* forward solve the lower triangular */
4944     idx    = 0;
4945     x[0] = b[idx]; x[1] = b[1+idx];
4946     for (i=1; i<n; i++) {
4947         v   = aa + 4*ai[i];
4948        vi   = aj + ai[i];
4949        nz   = ai[i+1] - ai[i];
4950        idx  = 2*i;
4951        s1   = b[idx];s2 = b[1+idx];
4952       for(k=0;k<nz;k++){
4953          jdx   = 2*vi[k];
4954           x1    = x[jdx];x2 = x[1+jdx];
4955           s1   -= v[0]*x1 + v[2]*x2;
4956           s2   -= v[1]*x1 + v[3]*x2;
4957            v   +=  4;
4958         }
4959        x[idx]   = s1;
4960        x[1+idx] = s2;
4961     }
4962 
4963    /* backward solve the upper triangular */
4964   for (i=n-1; i>=0; i--){
4965      v   = aa + 4*(adiag[i+1]+1);
4966      vi  = aj + adiag[i+1]+1;
4967      nz  = adiag[i] - adiag[i+1]-1;
4968      idt = 2*i;
4969      s1 = x[idt];  s2 = x[1+idt];
4970      for(k=0;k<nz;k++){
4971       idx   = 2*vi[k];
4972        x1    = x[idx];   x2 = x[1+idx];
4973        s1 -= v[0]*x1 + v[2]*x2;
4974        s2 -= v[1]*x1 + v[3]*x2;
4975          v    += 4;
4976     }
4977     /* x = inv_diagonal*x */
4978    x[idt]   = v[0]*s1 + v[2]*s2;
4979    x[1+idt] = v[1]*s1 + v[3]*s2;
4980   }
4981 
4982   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4983   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4984   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4985   PetscFunctionReturn(0);
4986 }
4987 
4988 #undef __FUNCT__
4989 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4990 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
4991 {
4992   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
4993   IS             iscol=a->col,isrow=a->row;
4994   PetscErrorCode ierr;
4995   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4996   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
4997   MatScalar      *aa=a->a,*v;
4998   PetscScalar    *x,*b,s1,*t;
4999 
5000   PetscFunctionBegin;
5001   if (!n) PetscFunctionReturn(0);
5002 
5003   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5004   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5005   t  = a->solve_work;
5006 
5007   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5008   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5009 
5010   /* forward solve the lower triangular */
5011   t[0] = b[*r++];
5012   for (i=1; i<n; i++) {
5013     v     = aa + ai[i];
5014     vi    = aj + ai[i];
5015     nz    = diag[i] - ai[i];
5016     s1  = b[*r++];
5017     while (nz--) {
5018       s1 -= (*v++)*t[*vi++];
5019     }
5020     t[i] = s1;
5021   }
5022   /* backward solve the upper triangular */
5023   for (i=n-1; i>=0; i--){
5024     v    = aa + diag[i] + 1;
5025     vi   = aj + diag[i] + 1;
5026     nz   = ai[i+1] - diag[i] - 1;
5027     s1 = t[i];
5028     while (nz--) {
5029       s1 -= (*v++)*t[*vi++];
5030     }
5031     x[*c--] = t[i] = aa[diag[i]]*s1;
5032   }
5033 
5034   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5035   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5036   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5037   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5038   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5039   PetscFunctionReturn(0);
5040 }
5041 /*
5042       Special case where the matrix was ILU(0) factored in the natural
5043    ordering. This eliminates the need for the column and row permutation.
5044 */
5045 #undef __FUNCT__
5046 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5047 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5048 {
5049   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5050   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5051   PetscErrorCode ierr;
5052   PetscInt       *diag = a->diag;
5053   MatScalar      *aa=a->a;
5054   PetscScalar    *x,*b;
5055   PetscScalar    s1,x1;
5056   MatScalar      *v;
5057   PetscInt       jdx,idt,idx,nz,*vi,i;
5058 
5059   PetscFunctionBegin;
5060   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5061   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5062 
5063   /* forward solve the lower triangular */
5064   idx    = 0;
5065   x[0]   = b[0];
5066   for (i=1; i<n; i++) {
5067     v     =  aa      + ai[i];
5068     vi    =  aj      + ai[i];
5069     nz    =  diag[i] - ai[i];
5070     idx   +=  1;
5071     s1  =  b[idx];
5072     while (nz--) {
5073       jdx   = *vi++;
5074       x1    = x[jdx];
5075       s1 -= v[0]*x1;
5076       v    += 1;
5077     }
5078     x[idx]   = s1;
5079   }
5080   /* backward solve the upper triangular */
5081   for (i=n-1; i>=0; i--){
5082     v    = aa + diag[i] + 1;
5083     vi   = aj + diag[i] + 1;
5084     nz   = ai[i+1] - diag[i] - 1;
5085     idt  = i;
5086     s1 = x[idt];
5087     while (nz--) {
5088       idx   = *vi++;
5089       x1    = x[idx];
5090       s1 -= v[0]*x1;
5091       v    += 1;
5092     }
5093     v        = aa +  diag[i];
5094     x[idt]   = v[0]*s1;
5095   }
5096   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5097   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5098   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5099   PetscFunctionReturn(0);
5100 }
5101 
5102 /* ----------------------------------------------------------------*/
5103 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5104 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5105 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
5106 
/*
   MatLUFactorNumeric_SeqBAIJ_N_newdatastruct - Numeric LU factorization of a
   SeqBAIJ matrix with arbitrary block size into a factor whose nonzero pattern
   (b->i, b->j, b->diag) has already been set up.

   Input Parameters:
+  B    - the factor matrix; its pattern, b->row and b->icol are read, b->a is filled
.  A    - the matrix to factor
-  info - factorization options (not referenced in this routine)

   Notes:
   For each block row i the routine
     1. zeros the slots of the dense work row rtmp that belong to the L and U
        pattern of row i,
     2. loads row r[i] of A into rtmp at the columns given by ic[],
     3. eliminates with every earlier row listed in the L part of row i,
     4. copies the L part, the diagonal block and the U part back into b->a;
        the diagonal block is inverted in place before it is stored.

   In the factor, the L part of row i occupies entries bi[i] .. bi[i+1]-1, the
   off-diagonal U part occupies bdiag[i+1]+1 .. bdiag[i]-1 and the diagonal block
   sits at bdiag[i].

   At the end the solve routine is chosen according to whether both b->row and
   b->icol are identity permutations.
*/
5107 #undef __FUNCT__
5108 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
5109 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
5110 {
5111   Mat            C=B;
5112   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5113   IS             isrow = b->row,isicol = b->icol;
5114   PetscErrorCode ierr;
5115   const PetscInt *r,*ic,*ics;
5116   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5117   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5118   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5119   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5120   MatScalar      *v_work;
5121   PetscTruth     col_identity,row_identity,both_identity;
5122
5123   PetscFunctionBegin;
5124   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5125   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5126
       /* rtmp is the dense work row: one block of bs2 scalars per block column, initially zero */
5127   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5128   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5129   ics  = ic; /* NOTE(review): ics is assigned here but not read again in this routine */
5130
5131   /* generate work space needed by dense LU factorization */
5132   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5133
5134   for (i=0; i<n; i++){
5135     /* zero rtmp */
5136     /* L part */
5137     nz    = bi[i+1] - bi[i];
5138     bjtmp = bj + bi[i];
5139     for  (j=0; j<nz; j++){
5140       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5141     }
5142
5143     /* U part (this count includes the diagonal block at bdiag[i]) */
5144     nz = bdiag[i] - bdiag[i+1];
5145     bjtmp = bj + bdiag[i+1]+1;
5146     for  (j=0; j<nz; j++){
5147       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5148     }
5149
5150     /* load in initial (unfactored row): row r[i] of A, scattered to columns ic[] */
5151     nz    = ai[r[i]+1] - ai[r[i]];
5152     ajtmp = aj + ai[r[i]];
5153     v     = aa + bs2*ai[r[i]];
5154     for (j=0; j<nz; j++) {
5155       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5156     }
5157
5158     /* elimination: visit the earlier rows named by the L pattern of row i, in stored order */
5159     bjtmp = bj + bi[i];
5160     nzL   = bi[i+1] - bi[i];
5161     for(k=0;k < nzL;k++) {
5162       row = bjtmp[k];
5163       pc = rtmp + bs2*row;
           /* skip the update when the whole block rtmp(row) is exactly zero */
5164       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5165       if (flg) {
             /* pv is the diagonal block of the earlier row, already inverted when that row was finished */
5166         pv         = b->a + bs2*bdiag[row];
5167         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5168         pj         = b->j + bdiag[row+1]+1; /* beginning of U(row,:) */
5169         pv         = b->a + bs2*(bdiag[row+1]+1);
5170         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries in U(row,:), excluding diag */
5171         for (j=0; j<nz; j++) {
5172           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5173         }
5174         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5175       }
5176     }
5177
5178     /* finished row so stick it into b->a */
5179     /* L part */
5180     pv   = b->a + bs2*bi[i] ;
5181     pj   = b->j + bi[i] ;
5182     nz   = bi[i+1] - bi[i];
5183     for (j=0; j<nz; j++) {
5184       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5185     }
5186
5187     /* Mark diagonal and invert diagonal for simpler triangular solves */
5188     pv  = b->a + bs2*bdiag[i];
5189     pj  = b->j + bdiag[i];
5190     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5191     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5192     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5193
5194     /* U part (off-diagonal entries only) */
5195     pv = b->a + bs2*(bdiag[i+1]+1);
5196     pj = b->j + bdiag[i+1]+1;
5197     nz = bdiag[i] - bdiag[i+1] - 1;
5198     for (j=0; j<nz; j++){
5199       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5200     }
5201   }
5202
5203   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5204   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5205   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5206   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5207
       /* pick the solve routine: the natural-ordering variant only when both permutations are the identity */
5208   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5209   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5210   both_identity = (PetscTruth) (row_identity && col_identity);
5211   if (both_identity){
5212     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5213   } else {
5214     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5215   }
5216   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct;
5217
5218   C->assembled = PETSC_TRUE;
5219   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5220   PetscFunctionReturn(0);
5221 }
5222 
5223 /*
5224    ilu(0) with natural ordering under new data structure.
5225    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
5226    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
5227 */
5228 
5229 #undef __FUNCT__
5230 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
5231 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5232 {
5233 
5234   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5235   PetscErrorCode     ierr;
5236   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5237   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5238 
5239   PetscFunctionBegin;
5240   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5241   b    = (Mat_SeqBAIJ*)(fact)->data;
5242 
5243   /* allocate matrix arrays for new data structure */
5244   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5245   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5246   b->singlemalloc = PETSC_TRUE;
5247   if (!b->diag){
5248     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5249     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5250   }
5251   bdiag = b->diag;
5252 
5253   if (n > 0) {
5254     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5255   }
5256 
5257   /* set bi and bj with new data structure */
5258   bi = b->i;
5259   bj = b->j;
5260 
5261   /* L part */
5262   bi[0] = 0;
5263   for (i=0; i<n; i++){
5264     nz = adiag[i] - ai[i];
5265     bi[i+1] = bi[i] + nz;
5266     aj = a->j + ai[i];
5267     for (j=0; j<nz; j++){
5268       *bj = aj[j]; bj++;
5269     }
5270   }
5271 
5272   /* U part */
5273   bi_temp = bi[n];
5274   bdiag[n] = bi[n]-1;
5275   for (i=n-1; i>=0; i--){
5276     nz = ai[i+1] - adiag[i] - 1;
5277     bi_temp = bi_temp + nz + 1;
5278     aj = a->j + adiag[i] + 1;
5279     for (j=0; j<nz; j++){
5280       *bj = aj[j]; bj++;
5281     }
5282     /* diag[i] */
5283     *bj = i; bj++;
5284     bdiag[i] = bi_temp - 1;
5285   }
5286   PetscFunctionReturn(0);
5287 }
5288 
/*
   MatILUFactorSymbolic_SeqBAIJ_newdatastruct - Symbolic ILU(levels) factorization
   of a SeqBAIJ matrix that produces the factor's nonzero pattern in the
   "new data structure" (b->i, b->j, b->diag).

   Input Parameters:
+  fact  - the factor matrix to set up
.  A     - the matrix to factor; must be square and have no missing diagonal entry
.  isrow - row permutation
.  iscol - column permutation
-  info  - factorization options; fill, levels, diagonal_fill and pivotinblocks are read

   Notes:
   When levels == 0 and both permutations are the identity, the pattern is copied
   by MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct() and the routine returns early.

   Otherwise each row i is built in a sorted linked list (lnk, with fill levels
   in lnk_lvl): the permuted row of A is inserted, the diagonal is added if
   diagonal_fill is set and it is absent, and the part after the diagonal of
   every earlier row that appears in the list is merged in.  The finished row is
   written to the free-space lists, and after the loop the lists are copied
   into one contiguous b->j.

   Inside the row loop bdiag[i] holds the number of entries that precede the
   diagonal in row i; PetscFreeSpaceContiguous_LU_v2() then resets bi and bdiag
   for the new data structure.
*/
5289 #undef __FUNCT__
5290 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
5291 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5292 {
5293   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5294   IS                 isicol;
5295   PetscErrorCode     ierr;
5296   const PetscInt     *r,*ic;
5297   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5298   PetscInt           *bi,*cols,nnz,*cols_lvl;
5299   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5300   PetscInt           i,levels,diagonal_fill;
5301   PetscTruth         col_identity,row_identity,both_identity;
5302   PetscReal          f;
5303   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5304   PetscBT            lnkbt;
5305   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5306   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5307   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5308   PetscTruth         missing;
5309   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5310
5311   PetscFunctionBegin;
5312   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5313   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5314   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5315
5316   f             = info->fill;
5317   levels        = (PetscInt)info->levels;
5318   diagonal_fill = (PetscInt)info->diagonal_fill;
       /* isicol is the inverse of the column permutation; it is stored in b->icol on both paths below */
5319   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5320
5321   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5322   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5323   both_identity = (PetscTruth) (row_identity && col_identity);
5324
5325   if (!levels && both_identity) {
5326     /* special case: ilu(0) with natural ordering */
5327     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5328     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5329
5330     fact->factor = MAT_FACTOR_ILU;
5331     (fact)->info.factor_mallocs    = 0;
5332     (fact)->info.fill_ratio_given  = info->fill;
5333     (fact)->info.fill_ratio_needed = 1.0;
5334     b                = (Mat_SeqBAIJ*)(fact)->data;
5335     b->row           = isrow;
5336     b->col           = iscol;
5337     b->icol          = isicol;
5338     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5339     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5340     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5341     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5342     PetscFunctionReturn(0);
5343   }
5344
5345   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5346   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5347
5348   /* get new row pointers */
5349   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5350   bi[0] = 0;
5351   /* bdiag is location of diagonal in factor */
5352   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5353   bdiag[0]  = 0;
5354
       /* bj_ptr[i] / bjlvl_ptr[i] remember where row i's column indices / fill levels were written */
5355   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5356
5357   /* create a linked list for storing column indices of the active row */
5358   nlnk = n + 1;
5359   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5360
5361   /* initial FreeSpace size is f*(ai[n]+1) */
5362   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5363   current_space = free_space;
5364   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5365   current_space_lvl = free_space_lvl;
5366
5367   for (i=0; i<n; i++) {
5368     nzi = 0;
5369     /* copy current row into linked list */
5370     nnz  = ai[r[i]+1] - ai[r[i]];
5371     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5372     cols = aj + ai[r[i]];
5373     lnk[i] = -1; /* marker to indicate if diagonal exists */
5374     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5375     nzi += nlnk;
5376
5377     /* make sure diagonal entry is included */
5378     if (diagonal_fill && lnk[i] == -1) {
           /* walk the list from its head (slot n) to the last entry below i and splice i in after it */
5379       fm = n;
5380       while (lnk[fm] < i) fm = lnk[fm];
5381       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5382       lnk[fm]    = i;
5383       lnk_lvl[i] = 0;
5384       nzi++; dcount++;
5385     }
5386
5387     /* add pivot rows into the active row */
5388     nzbd = 0;
5389     prow = lnk[n];
5390     while (prow < i) {
           /* bdiag[prow] entries precede the diagonal of row prow; merge only what follows the diagonal */
5391       nnz      = bdiag[prow];
5392       cols     = bj_ptr[prow] + nnz + 1;
5393       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5394       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5395       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5396       nzi += nlnk;
5397       prow = lnk[prow];
5398       nzbd++;
5399     }
         /* nzbd = number of list entries with column < i, i.e. the offset of the diagonal within row i */
5400     bdiag[i] = nzbd;
5401     bi[i+1]  = bi[i] + nzi;
5402
5403     /* if free space is not available, make more free space */
5404     if (current_space->local_remaining<nzi) {
5405       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5406       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5407       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5408       reallocs++;
5409     }
5410
5411     /* copy data into free_space and free_space_lvl, then initialize lnk */
5412     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5413     bj_ptr[i]    = current_space->array;
5414     bjlvl_ptr[i] = current_space_lvl->array;
5415
5416     /* make sure the active row i has diagonal entry */
5417     if (*(bj_ptr[i]+bdiag[i]) != i) {
5418       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5419     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5420     }
5421
         /* advance both free-space cursors past the row just written */
5422     current_space->array           += nzi;
5423     current_space->local_used      += nzi;
5424     current_space->local_remaining -= nzi;
5425     current_space_lvl->array           += nzi;
5426     current_space_lvl->local_used      += nzi;
5427     current_space_lvl->local_remaining -= nzi;
5428   }
5429
5430   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5431   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5432
5433   /* destroy list of free space and other temporary arrays */
5434   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5435
5436   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5437   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5438
5439   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5440   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5441   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5442
5443 #if defined(PETSC_USE_INFO)
5444   {
5445     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5446     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5447     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5448     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5449     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5450     if (diagonal_fill) {
5451       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5452     }
5453   }
5454 #endif
5455
5456   /* put together the new matrix */
5457   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5458   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5459   b = (Mat_SeqBAIJ*)(fact)->data;
5460   b->free_a       = PETSC_TRUE;
5461   b->free_ij      = PETSC_TRUE;
5462   b->singlemalloc = PETSC_FALSE;
       /* bdiag[0]+1 is used as the factor's total block count (see b->nz below) */
5463   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5464   b->j          = bj;
5465   b->i          = bi;
5466   b->diag       = bdiag;
5467   b->free_diag  = PETSC_TRUE;
5468   b->ilen       = 0;
5469   b->imax       = 0;
5470   b->row        = isrow;
5471   b->col        = iscol;
5472   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5473   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5474   b->icol       = isicol;
5475   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5476   /* In b structure:  Free imax, ilen, old a, old j.
5477      Allocate bdiag, solve_work, new a, new j */
5478   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5479   b->maxnz = b->nz = bdiag[0]+1;
5480   fact->info.factor_mallocs    = reallocs;
5481   fact->info.fill_ratio_given  = f;
5482   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5483   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
5484   PetscFunctionReturn(0);
5485 }
5486 
5487 
5488 /*
5489      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5490    except that the data structure of Mat_SeqAIJ is slightly different.
5491    Not a good example of code reuse.
5492 */
5493 #undef __FUNCT__
5494 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5495 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5496 {
5497   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5498   IS             isicol;
5499   PetscErrorCode ierr;
5500   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5501   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5502   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5503   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5504   PetscTruth     col_identity,row_identity,both_identity,flg;
5505   PetscReal      f;
5506   PetscTruth     newdatastruct = PETSC_FALSE;
5507 
5508   PetscFunctionBegin;
5509   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5510   if (newdatastruct){
5511     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5512     PetscFunctionReturn(0);
5513   }
5514 
5515   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5516   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5517 
5518   f             = info->fill;
5519   levels        = (PetscInt)info->levels;
5520   diagonal_fill = (PetscInt)info->diagonal_fill;
5521   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5522 
5523   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5524   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5525   both_identity = (PetscTruth) (row_identity && col_identity);
5526 
5527   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5528     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5529     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5530 
5531     fact->factor = MAT_FACTOR_ILU;
5532     b            = (Mat_SeqBAIJ*)fact->data;
5533     b->row       = isrow;
5534     b->col       = iscol;
5535     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5536     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5537     b->icol      = isicol;
5538     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5539     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5540     PetscFunctionReturn(0);
5541   }
5542 
5543   /* general case perform the symbolic factorization */
5544     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5545     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5546 
5547     /* get new row pointers */
5548     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5549     ainew[0] = 0;
5550     /* don't know how many column pointers are needed so estimate */
5551     jmax = (PetscInt)(f*ai[n] + 1);
5552     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5553     /* ajfill is level of fill for each fill entry */
5554     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5555     /* fill is a linked list of nonzeros in active row */
5556     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5557     /* im is level for each filled value */
5558     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5559     /* dloc is location of diagonal in factor */
5560     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5561     dloc[0]  = 0;
5562     for (prow=0; prow<n; prow++) {
5563 
5564       /* copy prow into linked list */
5565       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5566       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5567       xi         = aj + ai[r[prow]];
5568       fill[n]    = n;
5569       fill[prow] = -1; /* marker for diagonal entry */
5570       while (nz--) {
5571 	fm  = n;
5572 	idx = ic[*xi++];
5573 	do {
5574 	  m  = fm;
5575 	  fm = fill[m];
5576 	} while (fm < idx);
5577 	fill[m]   = idx;
5578 	fill[idx] = fm;
5579 	im[idx]   = 0;
5580       }
5581 
5582       /* make sure diagonal entry is included */
5583       if (diagonal_fill && fill[prow] == -1) {
5584 	fm = n;
5585 	while (fill[fm] < prow) fm = fill[fm];
5586 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5587 	fill[fm]   = prow;
5588 	im[prow]   = 0;
5589 	nzf++;
5590 	dcount++;
5591       }
5592 
5593       nzi = 0;
5594       row = fill[n];
5595       while (row < prow) {
5596 	incrlev = im[row] + 1;
5597 	nz      = dloc[row];
5598 	xi      = ajnew  + ainew[row] + nz + 1;
5599 	flev    = ajfill + ainew[row] + nz + 1;
5600 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5601 	fm      = row;
5602 	while (nnz-- > 0) {
5603 	  idx = *xi++;
5604 	  if (*flev + incrlev > levels) {
5605 	    flev++;
5606 	    continue;
5607 	  }
5608 	  do {
5609 	    m  = fm;
5610 	    fm = fill[m];
5611 	  } while (fm < idx);
5612 	  if (fm != idx) {
5613 	    im[idx]   = *flev + incrlev;
5614 	    fill[m]   = idx;
5615 	    fill[idx] = fm;
5616 	    fm        = idx;
5617 	    nzf++;
5618 	  } else {
5619 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5620 	  }
5621 	  flev++;
5622 	}
5623 	row = fill[row];
5624 	nzi++;
5625       }
5626       /* copy new filled row into permanent storage */
5627       ainew[prow+1] = ainew[prow] + nzf;
5628       if (ainew[prow+1] > jmax) {
5629 
5630 	/* estimate how much additional space we will need */
5631 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5632 	/* just double the memory each time */
5633 	PetscInt maxadd = jmax;
5634 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5635 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5636 	jmax += maxadd;
5637 
5638 	/* allocate a longer ajnew and ajfill */
5639 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5640 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5641 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5642 	ajnew = xitmp;
5643 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5644 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5645 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5646 	ajfill = xitmp;
5647 	reallocate++; /* count how many reallocations are needed */
5648       }
5649       xitmp       = ajnew + ainew[prow];
5650       flev        = ajfill + ainew[prow];
5651       dloc[prow]  = nzi;
5652       fm          = fill[n];
5653       while (nzf--) {
5654 	*xitmp++ = fm;
5655 	*flev++ = im[fm];
5656 	fm      = fill[fm];
5657       }
5658       /* make sure row has diagonal entry */
5659       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5660 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5661     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5662       }
5663     }
5664     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5665     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5666     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5667     ierr = PetscFree(fill);CHKERRQ(ierr);
5668     ierr = PetscFree(im);CHKERRQ(ierr);
5669 
5670 #if defined(PETSC_USE_INFO)
5671     {
5672       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5673       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5674       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5675       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5676       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5677       if (diagonal_fill) {
5678 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5679       }
5680     }
5681 #endif
5682 
5683     /* put together the new matrix */
5684     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5685     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5686     b    = (Mat_SeqBAIJ*)fact->data;
5687     b->free_a       = PETSC_TRUE;
5688     b->free_ij      = PETSC_TRUE;
5689     b->singlemalloc = PETSC_FALSE;
5690     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5691     b->j          = ajnew;
5692     b->i          = ainew;
5693     for (i=0; i<n; i++) dloc[i] += ainew[i];
5694     b->diag       = dloc;
5695     b->free_diag  = PETSC_TRUE;
5696     b->ilen       = 0;
5697     b->imax       = 0;
5698     b->row        = isrow;
5699     b->col        = iscol;
5700     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5701     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5702     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5703     b->icol       = isicol;
5704     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5705     /* In b structure:  Free imax, ilen, old a, old j.
5706        Allocate dloc, solve_work, new a, new j */
5707     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5708     b->maxnz          = b->nz = ainew[n];
5709 
5710     fact->info.factor_mallocs    = reallocate;
5711     fact->info.fill_ratio_given  = f;
5712     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5713 
5714   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5715   PetscFunctionReturn(0);
5716 }
5717 
5718 #undef __FUNCT__
5719 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5720 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5721 {
5722   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5723   /* int i,*AJ=a->j,nz=a->nz; */
5724   PetscFunctionBegin;
5725   /* Undo Column scaling */
5726 /*    while (nz--) { */
5727 /*      AJ[i] = AJ[i]/4; */
5728 /*    } */
5729   /* This should really invoke a push/pop logic, but we don't have that yet. */
5730   A->ops->setunfactored = PETSC_NULL;
5731   PetscFunctionReturn(0);
5732 }
5733 
5734 #undef __FUNCT__
5735 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5736 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5737 {
5738   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5739   PetscInt       *AJ=a->j,nz=a->nz;
5740   unsigned short *aj=(unsigned short *)AJ;
5741   PetscFunctionBegin;
5742   /* Is this really necessary? */
5743   while (nz--) {
5744     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5745   }
5746   A->ops->setunfactored = PETSC_NULL;
5747   PetscFunctionReturn(0);
5748 }
5749 
5750 
5751