xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ba337c4413f444c9b07b767e9700a1bd83f660a1)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 #include "petscbt.h"
11 #include "../src/mat/utils/freespace.h"
12 
13 #undef __FUNCT__
14 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
15 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
16 {
17   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18   PetscErrorCode ierr;
19   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20   PetscInt       *diag = a->diag;
21   MatScalar      *aa=a->a,*v;
22   PetscScalar    s1,*x,*b;
23 
24   PetscFunctionBegin;
25   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
26   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28 
29   /* forward solve the U^T */
30   for (i=0; i<n; i++) {
31 
32     v     = aa + diag[i];
33     /* multiply by the inverse of the block diagonal */
34     s1    = (*v++)*x[i];
35     vi    = aj + diag[i] + 1;
36     nz    = ai[i+1] - diag[i] - 1;
37     while (nz--) {
38       x[*vi++]  -= (*v++)*s1;
39     }
40     x[i]   = s1;
41   }
42   /* backward solve the L^T */
43   for (i=n-1; i>=0; i--){
44     v    = aa + diag[i] - 1;
45     vi   = aj + diag[i] - 1;
46     nz   = diag[i] - ai[i];
47     s1   = x[i];
48     while (nz--) {
49       x[*vi--]   -=  (*v--)*s1;
50     }
51   }
52   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
53   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55   PetscFunctionReturn(0);
56 }
57 
58 #undef __FUNCT__
59 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61 {
62   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63   PetscErrorCode ierr;
64   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65   PetscInt       *diag = a->diag,oidx;
66   MatScalar      *aa=a->a,*v;
67   PetscScalar    s1,s2,x1,x2;
68   PetscScalar    *x,*b;
69 
70   PetscFunctionBegin;
71   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
73   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74 
75   /* forward solve the U^T */
76   idx = 0;
77   for (i=0; i<n; i++) {
78 
79     v     = aa + 4*diag[i];
80     /* multiply by the inverse of the block diagonal */
81     x1 = x[idx];   x2 = x[1+idx];
82     s1 = v[0]*x1  +  v[1]*x2;
83     s2 = v[2]*x1  +  v[3]*x2;
84     v += 4;
85 
86     vi    = aj + diag[i] + 1;
87     nz    = ai[i+1] - diag[i] - 1;
88     while (nz--) {
89       oidx = 2*(*vi++);
90       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92       v  += 4;
93     }
94     x[idx]   = s1;x[1+idx] = s2;
95     idx += 2;
96   }
97   /* backward solve the L^T */
98   for (i=n-1; i>=0; i--){
99     v    = aa + 4*diag[i] - 4;
100     vi   = aj + diag[i] - 1;
101     nz   = diag[i] - ai[i];
102     idt  = 2*i;
103     s1   = x[idt];  s2 = x[1+idt];
104     while (nz--) {
105       idx   = 2*(*vi--);
106       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108       v -= 4;
109     }
110   }
111   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114   PetscFunctionReturn(0);
115 }
116 
117 #undef __FUNCT__
118 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
119 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
120 {
121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122   PetscErrorCode ierr;
123   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124   PetscInt       nz,idx,idt,j,i,oidx;
125   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126   MatScalar      *aa=a->a,*v;
127   PetscScalar    s1,s2,x1,x2;
128   PetscScalar    *x,*b;
129 
130   PetscFunctionBegin;
131   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134 
135   /* forward solve the U^T */
136   idx = 0;
137   for (i=0; i<n; i++) {
138     v     = aa + bs2*diag[i];
139     /* multiply by the inverse of the block diagonal */
140     x1 = x[idx];   x2 = x[1+idx];
141     s1 = v[0]*x1  +  v[1]*x2;
142     s2 = v[2]*x1  +  v[3]*x2;
143     v -= bs2;
144 
145     vi    = aj + diag[i] - 1;
146     nz    = diag[i] - diag[i+1] - 1;
147     for(j=0;j>-nz;j--){
148       oidx = bs*vi[j];
149       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151       v  -= bs2;
152     }
153     x[idx]   = s1;x[1+idx] = s2;
154     idx += bs;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + bs2*ai[i];
159     vi   = aj + ai[i];
160     nz   = ai[i+1] - ai[i];
161     idt  = bs*i;
162     s1   = x[idt];  s2 = x[1+idt];
163     for(j=0;j<nz;j++){
164       idx   = bs*vi[j];
165       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167       v += bs2;
168     }
169   }
170   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173   PetscFunctionReturn(0);
174 }
175 
176 #undef __FUNCT__
177 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179 {
180   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181   PetscErrorCode ierr;
182   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183   PetscInt       *diag = a->diag,oidx;
184   MatScalar      *aa=a->a,*v;
185   PetscScalar    s1,s2,s3,x1,x2,x3;
186   PetscScalar    *x,*b;
187 
188   PetscFunctionBegin;
189   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
191   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192 
193   /* forward solve the U^T */
194   idx = 0;
195   for (i=0; i<n; i++) {
196 
197     v     = aa + 9*diag[i];
198     /* multiply by the inverse of the block diagonal */
199     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203     v += 9;
204 
205     vi    = aj + diag[i] + 1;
206     nz    = ai[i+1] - diag[i] - 1;
207     while (nz--) {
208       oidx = 3*(*vi++);
209       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212       v  += 9;
213     }
214     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215     idx += 3;
216   }
217   /* backward solve the L^T */
218   for (i=n-1; i>=0; i--){
219     v    = aa + 9*diag[i] - 9;
220     vi   = aj + diag[i] - 1;
221     nz   = diag[i] - ai[i];
222     idt  = 3*i;
223     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224     while (nz--) {
225       idx   = 3*(*vi--);
226       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229       v -= 9;
230     }
231   }
232   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235   PetscFunctionReturn(0);
236 }
237 
238 #undef __FUNCT__
239 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
240 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
241 {
242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243   PetscErrorCode ierr;
244   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245   PetscInt       nz,idx,idt,j,i,oidx;
246   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
247   MatScalar      *aa=a->a,*v;
248   PetscScalar    s1,s2,s3,x1,x2,x3;
249   PetscScalar    *x,*b;
250 
251   PetscFunctionBegin;
252   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255 
256   /* forward solve the U^T */
257   idx = 0;
258   for (i=0; i<n; i++) {
259     v     = aa + bs2*diag[i];
260     /* multiply by the inverse of the block diagonal */
261     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265     v -= bs2;
266 
267     vi    = aj + diag[i] - 1;
268     nz    = diag[i] - diag[i+1] - 1;
269     for(j=0;j>-nz;j--){
270       oidx = bs*vi[j];
271       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274       v  -= bs2;
275     }
276     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277     idx += bs;
278   }
279   /* backward solve the L^T */
280   for (i=n-1; i>=0; i--){
281     v    = aa + bs2*ai[i];
282     vi   = aj + ai[i];
283     nz   = ai[i+1] - ai[i];
284     idt  = bs*i;
285     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286     for(j=0;j<nz;j++){
287       idx   = bs*vi[j];
288       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291       v += bs2;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode ierr;
306   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   PetscInt       *diag = a->diag,oidx;
308   MatScalar      *aa=a->a,*v;
309   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
310   PetscScalar    *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 16*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328     v += 16;
329 
330     vi    = aj + diag[i] + 1;
331     nz    = ai[i+1] - diag[i] - 1;
332     while (nz--) {
333       oidx = 4*(*vi++);
334       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338       v  += 16;
339     }
340     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341     idx += 4;
342   }
343   /* backward solve the L^T */
344   for (i=n-1; i>=0; i--){
345     v    = aa + 16*diag[i] - 16;
346     vi   = aj + diag[i] - 1;
347     nz   = diag[i] - ai[i];
348     idt  = 4*i;
349     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350     while (nz--) {
351       idx   = 4*(*vi--);
352       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356       v -= 16;
357     }
358   }
359   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
360   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362   PetscFunctionReturn(0);
363 }
364 
365 #undef __FUNCT__
366 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
367 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
368 {
369   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
370   PetscErrorCode ierr;
371   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372   PetscInt       nz,idx,idt,j,i,oidx;
373   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
374   MatScalar      *aa=a->a,*v;
375   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
376   PetscScalar    *x,*b;
377 
378   PetscFunctionBegin;
379   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382 
383   /* forward solve the U^T */
384   idx = 0;
385   for (i=0; i<n; i++) {
386     v     = aa + bs2*diag[i];
387     /* multiply by the inverse of the block diagonal */
388     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393     v -= bs2;
394 
395     vi    = aj + diag[i] - 1;
396     nz    = diag[i] - diag[i+1] - 1;
397     for(j=0;j>-nz;j--){
398       oidx = bs*vi[j];
399       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403       v  -= bs2;
404     }
405     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406     idx += bs;
407   }
408   /* backward solve the L^T */
409   for (i=n-1; i>=0; i--){
410     v    = aa + bs2*ai[i];
411     vi   = aj + ai[i];
412     nz   = ai[i+1] - ai[i];
413     idt  = bs*i;
414     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415     for(j=0;j<nz;j++){
416       idx   = bs*vi[j];
417       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421       v += bs2;
422     }
423   }
424   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427   PetscFunctionReturn(0);
428 }
429 
430 #undef __FUNCT__
431 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433 {
434   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435   PetscErrorCode ierr;
436   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437   PetscInt       *diag = a->diag,oidx;
438   MatScalar      *aa=a->a,*v;
439   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
440   PetscScalar    *x,*b;
441 
442   PetscFunctionBegin;
443   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446 
447   /* forward solve the U^T */
448   idx = 0;
449   for (i=0; i<n; i++) {
450 
451     v     = aa + 25*diag[i];
452     /* multiply by the inverse of the block diagonal */
453     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459     v += 25;
460 
461     vi    = aj + diag[i] + 1;
462     nz    = ai[i+1] - diag[i] - 1;
463     while (nz--) {
464       oidx = 5*(*vi++);
465       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470       v  += 25;
471     }
472     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473     idx += 5;
474   }
475   /* backward solve the L^T */
476   for (i=n-1; i>=0; i--){
477     v    = aa + 25*diag[i] - 25;
478     vi   = aj + diag[i] - 1;
479     nz   = diag[i] - ai[i];
480     idt  = 5*i;
481     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482     while (nz--) {
483       idx   = 5*(*vi--);
484       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489       v -= 25;
490     }
491   }
492   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
493   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495   PetscFunctionReturn(0);
496 }
497 
498 #undef __FUNCT__
499 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
500 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
501 {
502   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503   PetscErrorCode ierr;
504   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505   PetscInt       nz,idx,idt,j,i,oidx;
506   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507   MatScalar      *aa=a->a,*v;
508   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
509   PetscScalar    *x,*b;
510 
511   PetscFunctionBegin;
512   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
514   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515 
516   /* forward solve the U^T */
517   idx = 0;
518   for (i=0; i<n; i++) {
519     v     = aa + bs2*diag[i];
520     /* multiply by the inverse of the block diagonal */
521     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522     x5 = x[4+idx];
523     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528     v -= bs2;
529 
530     vi    = aj + diag[i] - 1;
531     nz    = diag[i] - diag[i+1] - 1;
532     for(j=0;j>-nz;j--){
533       oidx = bs*vi[j];
534       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539       v  -= bs2;
540     }
541     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542     idx += bs;
543   }
544   /* backward solve the L^T */
545   for (i=n-1; i>=0; i--){
546     v    = aa + bs2*ai[i];
547     vi   = aj + ai[i];
548     nz   = ai[i+1] - ai[i];
549     idt  = bs*i;
550     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551     for(j=0;j<nz;j++){
552       idx   = bs*vi[j];
553       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558       v += bs2;
559     }
560   }
561   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
562   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564   PetscFunctionReturn(0);
565 }
566 
567 #undef __FUNCT__
568 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570 {
571   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572   PetscErrorCode ierr;
573   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574   PetscInt       *diag = a->diag,oidx;
575   MatScalar      *aa=a->a,*v;
576   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
577   PetscScalar    *x,*b;
578 
579   PetscFunctionBegin;
580   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
582   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583 
584   /* forward solve the U^T */
585   idx = 0;
586   for (i=0; i<n; i++) {
587 
588     v     = aa + 36*diag[i];
589     /* multiply by the inverse of the block diagonal */
590     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591     x6    = x[5+idx];
592     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598     v += 36;
599 
600     vi    = aj + diag[i] + 1;
601     nz    = ai[i+1] - diag[i] - 1;
602     while (nz--) {
603       oidx = 6*(*vi++);
604       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610       v  += 36;
611     }
612     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613     x[5+idx] = s6;
614     idx += 6;
615   }
616   /* backward solve the L^T */
617   for (i=n-1; i>=0; i--){
618     v    = aa + 36*diag[i] - 36;
619     vi   = aj + diag[i] - 1;
620     nz   = diag[i] - ai[i];
621     idt  = 6*i;
622     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623     s6 = x[5+idt];
624     while (nz--) {
625       idx   = 6*(*vi--);
626       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632       v -= 36;
633     }
634   }
635   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
636   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638   PetscFunctionReturn(0);
639 }
640 
641 #undef __FUNCT__
642 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
643 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
644 {
645   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
646   PetscErrorCode ierr;
647   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648   PetscInt       nz,idx,idt,j,i,oidx;
649   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
650   MatScalar      *aa=a->a,*v;
651   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
652   PetscScalar    *x,*b;
653 
654   PetscFunctionBegin;
655   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
657   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658 
659   /* forward solve the U^T */
660   idx = 0;
661   for (i=0; i<n; i++) {
662     v     = aa + bs2*diag[i];
663     /* multiply by the inverse of the block diagonal */
664     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665     x5 = x[4+idx]; x6 = x[5+idx];
666     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672     v -= bs2;
673 
674     vi    = aj + diag[i] - 1;
675     nz    = diag[i] - diag[i+1] - 1;
676     for(j=0;j>-nz;j--){
677       oidx = bs*vi[j];
678       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684       v  -= bs2;
685     }
686     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687     x[5+idx] = s6;
688     idx += bs;
689   }
690   /* backward solve the L^T */
691   for (i=n-1; i>=0; i--){
692     v    = aa + bs2*ai[i];
693     vi   = aj + ai[i];
694     nz   = ai[i+1] - ai[i];
695     idt  = bs*i;
696     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697     s6   = x[5+idt];
698     for(j=0;j<nz;j++){
699       idx   = bs*vi[j];
700       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706       v += bs2;
707     }
708   }
709   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
710   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712   PetscFunctionReturn(0);
713 }
714 
715 #undef __FUNCT__
716 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718 {
719   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720   PetscErrorCode ierr;
721   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722   PetscInt       *diag = a->diag,oidx;
723   MatScalar      *aa=a->a,*v;
724   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
725   PetscScalar    *x,*b;
726 
727   PetscFunctionBegin;
728   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
730   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731 
732   /* forward solve the U^T */
733   idx = 0;
734   for (i=0; i<n; i++) {
735 
736     v     = aa + 49*diag[i];
737     /* multiply by the inverse of the block diagonal */
738     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739     x6    = x[5+idx]; x7 = x[6+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747     v += 49;
748 
749     vi    = aj + diag[i] + 1;
750     nz    = ai[i+1] - diag[i] - 1;
751     while (nz--) {
752       oidx = 7*(*vi++);
753       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760       v  += 49;
761     }
762     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763     x[5+idx] = s6;x[6+idx] = s7;
764     idx += 7;
765   }
766   /* backward solve the L^T */
767   for (i=n-1; i>=0; i--){
768     v    = aa + 49*diag[i] - 49;
769     vi   = aj + diag[i] - 1;
770     nz   = diag[i] - ai[i];
771     idt  = 7*i;
772     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773     s6 = x[5+idt];s7 = x[6+idt];
774     while (nz--) {
775       idx   = 7*(*vi--);
776       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783       v -= 49;
784     }
785   }
786   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
787   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789   PetscFunctionReturn(0);
790 }
791 #undef __FUNCT__
792 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
793 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
794 {
795   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796   PetscErrorCode ierr;
797   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798   PetscInt       nz,idx,idt,j,i,oidx;
799   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
800   MatScalar      *aa=a->a,*v;
801   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
802   PetscScalar    *x,*b;
803 
804   PetscFunctionBegin;
805   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
807   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808 
809   /* forward solve the U^T */
810   idx = 0;
811   for (i=0; i<n; i++) {
812     v     = aa + bs2*diag[i];
813     /* multiply by the inverse of the block diagonal */
814     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823     v -= bs2;
824     vi    = aj + diag[i] - 1;
825     nz    = diag[i] - diag[i+1] - 1;
826     for(j=0;j>-nz;j--){
827       oidx = bs*vi[j];
828       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835       v  -= bs2;
836     }
837     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838     x[5+idx] = s6;  x[6+idx] = s7;
839     idx += bs;
840   }
841   /* backward solve the L^T */
842   for (i=n-1; i>=0; i--){
843     v    = aa + bs2*ai[i];
844     vi   = aj + ai[i];
845     nz   = ai[i+1] - ai[i];
846     idt  = bs*i;
847     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848     s6   = x[5+idt];  s7 = x[6+idt];
849     for(j=0;j<nz;j++){
850       idx   = bs*vi[j];
851       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858       v += bs2;
859     }
860   }
861   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
862   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864   PetscFunctionReturn(0);
865 }
866 
867 /*---------------------------------------------------------------------------------------------*/
868 #undef __FUNCT__
869 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871 {
872   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873   IS             iscol=a->col,isrow=a->row;
874   PetscErrorCode ierr;
875   const PetscInt *r,*c,*rout,*cout;
876   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877   PetscInt       *diag = a->diag;
878   MatScalar      *aa=a->a,*v;
879   PetscScalar    s1,*x,*b,*t;
880 
881   PetscFunctionBegin;
882   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
883   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884   t  = a->solve_work;
885 
886   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888 
889   /* copy the b into temp work space according to permutation */
890   for (i=0; i<n; i++) {
891     t[i] = b[c[i]];
892   }
893 
894   /* forward solve the U^T */
895   for (i=0; i<n; i++) {
896 
897     v     = aa + diag[i];
898     /* multiply by the inverse of the block diagonal */
899     s1    = (*v++)*t[i];
900     vi    = aj + diag[i] + 1;
901     nz    = ai[i+1] - diag[i] - 1;
902     while (nz--) {
903       t[*vi++]  -= (*v++)*s1;
904     }
905     t[i]   = s1;
906   }
907   /* backward solve the L^T */
908   for (i=n-1; i>=0; i--){
909     v    = aa + diag[i] - 1;
910     vi   = aj + diag[i] - 1;
911     nz   = diag[i] - ai[i];
912     s1   = t[i];
913     while (nz--) {
914       t[*vi--]   -=  (*v--)*s1;
915     }
916   }
917 
918   /* copy t into x according to permutation */
919   for (i=0; i<n; i++) {
920     x[r[i]]   = t[i];
921   }
922 
923   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
925   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
926   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928   PetscFunctionReturn(0);
929 }
930 
931 #undef __FUNCT__
932 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
933 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
934 {
935   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936   IS             iscol=a->col,isrow=a->row;
937   PetscErrorCode ierr;
938   const PetscInt *r,*c,*rout,*cout;
939   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941   MatScalar      *aa=a->a,*v;
942   PetscScalar    s1,s2,x1,x2;
943   PetscScalar    *x,*b,*t;
944 
945   PetscFunctionBegin;
946   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
947   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948   t  = a->solve_work;
949 
950   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952 
953   /* copy the b into temp work space according to permutation */
954   ii = 0;
955   for (i=0; i<n; i++) {
956     ic      = 2*c[i];
957     t[ii]   = b[ic];
958     t[ii+1] = b[ic+1];
959     ii += 2;
960   }
961 
962   /* forward solve the U^T */
963   idx = 0;
964   for (i=0; i<n; i++) {
965 
966     v     = aa + 4*diag[i];
967     /* multiply by the inverse of the block diagonal */
968     x1    = t[idx];   x2 = t[1+idx];
969     s1 = v[0]*x1  +  v[1]*x2;
970     s2 = v[2]*x1  +  v[3]*x2;
971     v += 4;
972 
973     vi    = aj + diag[i] + 1;
974     nz    = ai[i+1] - diag[i] - 1;
975     while (nz--) {
976       oidx = 2*(*vi++);
977       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979       v  += 4;
980     }
981     t[idx]   = s1;t[1+idx] = s2;
982     idx += 2;
983   }
984   /* backward solve the L^T */
985   for (i=n-1; i>=0; i--){
986     v    = aa + 4*diag[i] - 4;
987     vi   = aj + diag[i] - 1;
988     nz   = diag[i] - ai[i];
989     idt  = 2*i;
990     s1 = t[idt];  s2 = t[1+idt];
991     while (nz--) {
992       idx   = 2*(*vi--);
993       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995       v -= 4;
996     }
997   }
998 
999   /* copy t into x according to permutation */
1000   ii = 0;
1001   for (i=0; i<n; i++) {
1002     ir      = 2*r[i];
1003     x[ir]   = t[ii];
1004     x[ir+1] = t[ii+1];
1005     ii += 2;
1006   }
1007 
1008   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1010   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1011   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013   PetscFunctionReturn(0);
1014 }
1015 
1016 #undef __FUNCT__
1017 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
1018 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
1019 {
1020   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1021   PetscErrorCode ierr;
1022   IS             iscol=a->col,isrow=a->row;
1023   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1024   const PetscInt *r,*c,*rout,*cout;
1025   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1026   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1027   MatScalar      *aa=a->a,*v;
1028   PetscScalar    s1,s2,x1,x2;
1029   PetscScalar    *x,*b,*t;
1030 
1031   PetscFunctionBegin;
1032   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1033   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1034   t = a->solve_work;
1035 
1036   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1037   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1038 
1039   /* copy b into temp work space according to permutation */
1040   for(i=0;i<n;i++){
1041     ii = bs*i; ic = bs*c[i];
1042     t[ii] = b[ic]; t[ii+1] = b[ic+1];
1043   }
1044 
1045   /* forward solve the U^T */
1046   idx = 0;
1047   for (i=0; i<n; i++) {
1048     v     = aa + bs2*diag[i];
1049     /* multiply by the inverse of the block diagonal */
1050     x1 = t[idx];   x2 = t[1+idx];
1051     s1 = v[0]*x1  +  v[1]*x2;
1052     s2 = v[2]*x1  +  v[3]*x2;
1053     v -= bs2;
1054 
1055     vi    = aj + diag[i] - 1;
1056     nz    = diag[i] - diag[i+1] - 1;
1057     for(j=0;j>-nz;j--){
1058       oidx = bs*vi[j];
1059       t[oidx]   -= v[0]*s1  +  v[1]*s2;
1060       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
1061       v  -= bs2;
1062     }
1063     t[idx]   = s1;t[1+idx] = s2;
1064     idx += bs;
1065   }
1066   /* backward solve the L^T */
1067   for (i=n-1; i>=0; i--){
1068     v    = aa + bs2*ai[i];
1069     vi   = aj + ai[i];
1070     nz   = ai[i+1] - ai[i];
1071     idt  = bs*i;
1072     s1   = t[idt];  s2 = t[1+idt];
1073     for(j=0;j<nz;j++){
1074       idx   = bs*vi[j];
1075       t[idx]   -=  v[0]*s1 +  v[1]*s2;
1076       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
1077       v += bs2;
1078     }
1079   }
1080 
1081   /* copy t into x according to permutation */
1082   for(i=0;i<n;i++){
1083     ii = bs*i;  ir = bs*r[i];
1084     x[ir] = t[ii];  x[ir+1] = t[ii+1];
1085   }
1086 
1087   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1092   PetscFunctionReturn(0);
1093 }
1094 
1095 #undef __FUNCT__
1096 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1097 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1098 {
1099   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1100   IS             iscol=a->col,isrow=a->row;
1101   PetscErrorCode ierr;
1102   const PetscInt *r,*c,*rout,*cout;
1103   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1104   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1105   MatScalar      *aa=a->a,*v;
1106   PetscScalar    s1,s2,s3,x1,x2,x3;
1107   PetscScalar    *x,*b,*t;
1108 
1109   PetscFunctionBegin;
1110   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1112   t  = a->solve_work;
1113 
1114   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1115   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1116 
1117   /* copy the b into temp work space according to permutation */
1118   ii = 0;
1119   for (i=0; i<n; i++) {
1120     ic      = 3*c[i];
1121     t[ii]   = b[ic];
1122     t[ii+1] = b[ic+1];
1123     t[ii+2] = b[ic+2];
1124     ii += 3;
1125   }
1126 
1127   /* forward solve the U^T */
1128   idx = 0;
1129   for (i=0; i<n; i++) {
1130 
1131     v     = aa + 9*diag[i];
1132     /* multiply by the inverse of the block diagonal */
1133     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1134     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1135     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1136     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1137     v += 9;
1138 
1139     vi    = aj + diag[i] + 1;
1140     nz    = ai[i+1] - diag[i] - 1;
1141     while (nz--) {
1142       oidx = 3*(*vi++);
1143       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1144       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1145       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1146       v  += 9;
1147     }
1148     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1149     idx += 3;
1150   }
1151   /* backward solve the L^T */
1152   for (i=n-1; i>=0; i--){
1153     v    = aa + 9*diag[i] - 9;
1154     vi   = aj + diag[i] - 1;
1155     nz   = diag[i] - ai[i];
1156     idt  = 3*i;
1157     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1158     while (nz--) {
1159       idx   = 3*(*vi--);
1160       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1161       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1162       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1163       v -= 9;
1164     }
1165   }
1166 
1167   /* copy t into x according to permutation */
1168   ii = 0;
1169   for (i=0; i<n; i++) {
1170     ir      = 3*r[i];
1171     x[ir]   = t[ii];
1172     x[ir+1] = t[ii+1];
1173     x[ir+2] = t[ii+2];
1174     ii += 3;
1175   }
1176 
1177   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1178   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1179   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1180   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1181   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1182   PetscFunctionReturn(0);
1183 }
1184 
1185 #undef __FUNCT__
1186 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1187 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1188 {
1189   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1190   PetscErrorCode ierr;
1191   IS             iscol=a->col,isrow=a->row;
1192   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1193   const PetscInt *r,*c,*rout,*cout;
1194   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1195   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1196   MatScalar      *aa=a->a,*v;
1197   PetscScalar    s1,s2,s3,x1,x2,x3;
1198   PetscScalar    *x,*b,*t;
1199 
1200   PetscFunctionBegin;
1201   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1203   t = a->solve_work;
1204 
1205   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1206   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1207 
1208   /* copy b into temp work space according to permutation */
1209   for(i=0;i<n;i++){
1210     ii = bs*i; ic = bs*c[i];
1211     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
1212   }
1213 
1214   /* forward solve the U^T */
1215   idx = 0;
1216   for (i=0; i<n; i++) {
1217     v     = aa + bs2*diag[i];
1218     /* multiply by the inverse of the block diagonal */
1219     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1220     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1221     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1222     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1223     v -= bs2;
1224 
1225     vi    = aj + diag[i] - 1;
1226     nz    = diag[i] - diag[i+1] - 1;
1227     for(j=0;j>-nz;j--){
1228       oidx = bs*vi[j];
1229       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1230       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1231       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1232       v  -= bs2;
1233     }
1234     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
1235     idx += bs;
1236   }
1237   /* backward solve the L^T */
1238   for (i=n-1; i>=0; i--){
1239     v    = aa + bs2*ai[i];
1240     vi   = aj + ai[i];
1241     nz   = ai[i+1] - ai[i];
1242     idt  = bs*i;
1243     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
1244     for(j=0;j<nz;j++){
1245       idx   = bs*vi[j];
1246       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1247       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1248       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1249       v += bs2;
1250     }
1251   }
1252 
1253   /* copy t into x according to permutation */
1254   for(i=0;i<n;i++){
1255     ii = bs*i;  ir = bs*r[i];
1256     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
1257   }
1258 
1259   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1260   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1261   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1262   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1263   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1264   PetscFunctionReturn(0);
1265 }
1266 
1267 #undef __FUNCT__
1268 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1269 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1270 {
1271   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1272   IS             iscol=a->col,isrow=a->row;
1273   PetscErrorCode ierr;
1274   const PetscInt *r,*c,*rout,*cout;
1275   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1276   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1277   MatScalar      *aa=a->a,*v;
1278   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1279   PetscScalar    *x,*b,*t;
1280 
1281   PetscFunctionBegin;
1282   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1283   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1284   t  = a->solve_work;
1285 
1286   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1287   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1288 
1289   /* copy the b into temp work space according to permutation */
1290   ii = 0;
1291   for (i=0; i<n; i++) {
1292     ic      = 4*c[i];
1293     t[ii]   = b[ic];
1294     t[ii+1] = b[ic+1];
1295     t[ii+2] = b[ic+2];
1296     t[ii+3] = b[ic+3];
1297     ii += 4;
1298   }
1299 
1300   /* forward solve the U^T */
1301   idx = 0;
1302   for (i=0; i<n; i++) {
1303 
1304     v     = aa + 16*diag[i];
1305     /* multiply by the inverse of the block diagonal */
1306     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1307     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1308     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1309     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1310     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1311     v += 16;
1312 
1313     vi    = aj + diag[i] + 1;
1314     nz    = ai[i+1] - diag[i] - 1;
1315     while (nz--) {
1316       oidx = 4*(*vi++);
1317       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1318       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1319       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1320       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1321       v  += 16;
1322     }
1323     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1324     idx += 4;
1325   }
1326   /* backward solve the L^T */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 16*diag[i] - 16;
1329     vi   = aj + diag[i] - 1;
1330     nz   = diag[i] - ai[i];
1331     idt  = 4*i;
1332     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1333     while (nz--) {
1334       idx   = 4*(*vi--);
1335       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1336       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1337       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1338       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1339       v -= 16;
1340     }
1341   }
1342 
1343   /* copy t into x according to permutation */
1344   ii = 0;
1345   for (i=0; i<n; i++) {
1346     ir      = 4*r[i];
1347     x[ir]   = t[ii];
1348     x[ir+1] = t[ii+1];
1349     x[ir+2] = t[ii+2];
1350     x[ir+3] = t[ii+3];
1351     ii += 4;
1352   }
1353 
1354   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1355   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1356   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1357   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1359   PetscFunctionReturn(0);
1360 }
1361 
1362 #undef __FUNCT__
1363 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1364 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1365 {
1366   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1367   PetscErrorCode ierr;
1368   IS             iscol=a->col,isrow=a->row;
1369   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1370   const PetscInt *r,*c,*rout,*cout;
1371   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1372   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1373   MatScalar      *aa=a->a,*v;
1374   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
1375   PetscScalar    *x,*b,*t;
1376 
1377   PetscFunctionBegin;
1378   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1379   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380   t = a->solve_work;
1381 
1382   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1383   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1384 
1385   /* copy b into temp work space according to permutation */
1386   for(i=0;i<n;i++){
1387     ii = bs*i; ic = bs*c[i];
1388     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1389   }
1390 
1391   /* forward solve the U^T */
1392   idx = 0;
1393   for (i=0; i<n; i++) {
1394     v     = aa + bs2*diag[i];
1395     /* multiply by the inverse of the block diagonal */
1396     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
1397     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1398     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1399     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1400     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1401     v -= bs2;
1402 
1403     vi    = aj + diag[i] - 1;
1404     nz    = diag[i] - diag[i+1] - 1;
1405     for(j=0;j>-nz;j--){
1406       oidx = bs*vi[j];
1407       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1408       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1409       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1410       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1411       v  -= bs2;
1412     }
1413     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
1414     idx += bs;
1415   }
1416   /* backward solve the L^T */
1417   for (i=n-1; i>=0; i--){
1418     v    = aa + bs2*ai[i];
1419     vi   = aj + ai[i];
1420     nz   = ai[i+1] - ai[i];
1421     idt  = bs*i;
1422     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
1423     for(j=0;j<nz;j++){
1424       idx   = bs*vi[j];
1425       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
1426       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
1427       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
1428       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
1429       v += bs2;
1430     }
1431   }
1432 
1433   /* copy t into x according to permutation */
1434   for(i=0;i<n;i++){
1435     ii = bs*i;  ir = bs*r[i];
1436     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1437   }
1438 
1439   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1440   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1441   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1442   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1443   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1444   PetscFunctionReturn(0);
1445 }
1446 
1447 #undef __FUNCT__
1448 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1449 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1450 {
1451   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1452   IS             iscol=a->col,isrow=a->row;
1453   PetscErrorCode ierr;
1454   const PetscInt *r,*c,*rout,*cout;
1455   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1456   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1457   MatScalar      *aa=a->a,*v;
1458   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1459   PetscScalar    *x,*b,*t;
1460 
1461   PetscFunctionBegin;
1462   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1463   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1464   t  = a->solve_work;
1465 
1466   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1467   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1468 
1469   /* copy the b into temp work space according to permutation */
1470   ii = 0;
1471   for (i=0; i<n; i++) {
1472     ic      = 5*c[i];
1473     t[ii]   = b[ic];
1474     t[ii+1] = b[ic+1];
1475     t[ii+2] = b[ic+2];
1476     t[ii+3] = b[ic+3];
1477     t[ii+4] = b[ic+4];
1478     ii += 5;
1479   }
1480 
1481   /* forward solve the U^T */
1482   idx = 0;
1483   for (i=0; i<n; i++) {
1484 
1485     v     = aa + 25*diag[i];
1486     /* multiply by the inverse of the block diagonal */
1487     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1488     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1489     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1490     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1491     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1492     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1493     v += 25;
1494 
1495     vi    = aj + diag[i] + 1;
1496     nz    = ai[i+1] - diag[i] - 1;
1497     while (nz--) {
1498       oidx = 5*(*vi++);
1499       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1500       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1501       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1502       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1503       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1504       v  += 25;
1505     }
1506     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1507     idx += 5;
1508   }
1509   /* backward solve the L^T */
1510   for (i=n-1; i>=0; i--){
1511     v    = aa + 25*diag[i] - 25;
1512     vi   = aj + diag[i] - 1;
1513     nz   = diag[i] - ai[i];
1514     idt  = 5*i;
1515     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1516     while (nz--) {
1517       idx   = 5*(*vi--);
1518       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1519       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1520       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1521       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1522       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1523       v -= 25;
1524     }
1525   }
1526 
1527   /* copy t into x according to permutation */
1528   ii = 0;
1529   for (i=0; i<n; i++) {
1530     ir      = 5*r[i];
1531     x[ir]   = t[ii];
1532     x[ir+1] = t[ii+1];
1533     x[ir+2] = t[ii+2];
1534     x[ir+3] = t[ii+3];
1535     x[ir+4] = t[ii+4];
1536     ii += 5;
1537   }
1538 
1539   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1540   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1541   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1542   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1543   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1544   PetscFunctionReturn(0);
1545 }
1546 
1547 #undef __FUNCT__
1548 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1549 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1550 {
1551   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1552   PetscErrorCode ierr;
1553   IS             iscol=a->col,isrow=a->row;
1554   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1555   const PetscInt *r,*c,*rout,*cout;
1556   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1557   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1558   MatScalar      *aa=a->a,*v;
1559   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1560   PetscScalar    *x,*b,*t;
1561 
1562   PetscFunctionBegin;
1563   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1564   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565   t = a->solve_work;
1566 
1567   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1568   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1569 
1570   /* copy b into temp work space according to permutation */
1571   for(i=0;i<n;i++){
1572     ii = bs*i; ic = bs*c[i];
1573     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1574     t[ii+4] = b[ic+4];
1575   }
1576 
1577   /* forward solve the U^T */
1578   idx = 0;
1579   for (i=0; i<n; i++) {
1580     v     = aa + bs2*diag[i];
1581     /* multiply by the inverse of the block diagonal */
1582     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1583     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1584     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1585     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1586     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1587     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1588     v -= bs2;
1589 
1590     vi    = aj + diag[i] - 1;
1591     nz    = diag[i] - diag[i+1] - 1;
1592     for(j=0;j>-nz;j--){
1593       oidx = bs*vi[j];
1594       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1595       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1596       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1597       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1598       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1599       v  -= bs2;
1600     }
1601     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1602     idx += bs;
1603   }
1604   /* backward solve the L^T */
1605   for (i=n-1; i>=0; i--){
1606     v    = aa + bs2*ai[i];
1607     vi   = aj + ai[i];
1608     nz   = ai[i+1] - ai[i];
1609     idt  = bs*i;
1610     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1611     for(j=0;j<nz;j++){
1612       idx   = bs*vi[j];
1613       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1614       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1615       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1616       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1617       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1618       v += bs2;
1619     }
1620   }
1621 
1622   /* copy t into x according to permutation */
1623   for(i=0;i<n;i++){
1624     ii = bs*i;  ir = bs*r[i];
1625     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1626     x[ir+4] = t[ii+4];
1627   }
1628 
1629   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1630   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1639 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1640 {
1641   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1642   IS             iscol=a->col,isrow=a->row;
1643   PetscErrorCode ierr;
1644   const PetscInt *r,*c,*rout,*cout;
1645   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1646   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1647   MatScalar      *aa=a->a,*v;
1648   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1649   PetscScalar    *x,*b,*t;
1650 
1651   PetscFunctionBegin;
1652   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1653   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654   t  = a->solve_work;
1655 
1656   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1657   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1658 
1659   /* copy the b into temp work space according to permutation */
1660   ii = 0;
1661   for (i=0; i<n; i++) {
1662     ic      = 6*c[i];
1663     t[ii]   = b[ic];
1664     t[ii+1] = b[ic+1];
1665     t[ii+2] = b[ic+2];
1666     t[ii+3] = b[ic+3];
1667     t[ii+4] = b[ic+4];
1668     t[ii+5] = b[ic+5];
1669     ii += 6;
1670   }
1671 
1672   /* forward solve the U^T */
1673   idx = 0;
1674   for (i=0; i<n; i++) {
1675 
1676     v     = aa + 36*diag[i];
1677     /* multiply by the inverse of the block diagonal */
1678     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1679     x6    = t[5+idx];
1680     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1681     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1682     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1683     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1684     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1685     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1686     v += 36;
1687 
1688     vi    = aj + diag[i] + 1;
1689     nz    = ai[i+1] - diag[i] - 1;
1690     while (nz--) {
1691       oidx = 6*(*vi++);
1692       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1693       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1694       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1695       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1696       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1697       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1698       v  += 36;
1699     }
1700     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1701     t[5+idx] = s6;
1702     idx += 6;
1703   }
1704   /* backward solve the L^T */
1705   for (i=n-1; i>=0; i--){
1706     v    = aa + 36*diag[i] - 36;
1707     vi   = aj + diag[i] - 1;
1708     nz   = diag[i] - ai[i];
1709     idt  = 6*i;
1710     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1711     s6 = t[5+idt];
1712     while (nz--) {
1713       idx   = 6*(*vi--);
1714       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1715       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1716       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1717       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1718       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1719       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1720       v -= 36;
1721     }
1722   }
1723 
1724   /* copy t into x according to permutation */
1725   ii = 0;
1726   for (i=0; i<n; i++) {
1727     ir      = 6*r[i];
1728     x[ir]   = t[ii];
1729     x[ir+1] = t[ii+1];
1730     x[ir+2] = t[ii+2];
1731     x[ir+3] = t[ii+3];
1732     x[ir+4] = t[ii+4];
1733     x[ir+5] = t[ii+5];
1734     ii += 6;
1735   }
1736 
1737   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1742   PetscFunctionReturn(0);
1743 }
1744 
1745 #undef __FUNCT__
1746 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1747 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1748 {
1749   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1750   PetscErrorCode ierr;
1751   IS             iscol=a->col,isrow=a->row;
1752   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1753   const PetscInt *r,*c,*rout,*cout;
1754   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1755   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1756   MatScalar      *aa=a->a,*v;
1757   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1758   PetscScalar    *x,*b,*t;
1759 
1760   PetscFunctionBegin;
1761   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1762   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1763   t = a->solve_work;
1764 
1765   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1766   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1767 
1768   /* copy b into temp work space according to permutation */
1769   for(i=0;i<n;i++){
1770     ii = bs*i; ic = bs*c[i];
1771     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1772     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
1773   }
1774 
1775   /* forward solve the U^T */
1776   idx = 0;
1777   for (i=0; i<n; i++) {
1778     v     = aa + bs2*diag[i];
1779     /* multiply by the inverse of the block diagonal */
1780     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1781     x6    = t[5+idx];
1782     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1783     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1784     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1785     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1786     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1787     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1788     v -= bs2;
1789 
1790     vi    = aj + diag[i] - 1;
1791     nz    = diag[i] - diag[i+1] - 1;
1792     for(j=0;j>-nz;j--){
1793       oidx = bs*vi[j];
1794       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1795       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1796       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1797       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1798       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1799       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1800       v  -= bs2;
1801     }
1802     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
1803     t[5+idx] = s6;
1804     idx += bs;
1805   }
1806   /* backward solve the L^T */
1807   for (i=n-1; i>=0; i--){
1808     v    = aa + bs2*ai[i];
1809     vi   = aj + ai[i];
1810     nz   = ai[i+1] - ai[i];
1811     idt  = bs*i;
1812     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
1813     s6   = t[5+idt];
1814    for(j=0;j<nz;j++){
1815       idx   = bs*vi[j];
1816       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1817       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1818       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1819       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1820       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1821       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1822       v += bs2;
1823     }
1824   }
1825 
1826   /* copy t into x according to permutation */
1827   for(i=0;i<n;i++){
1828     ii = bs*i;  ir = bs*r[i];
1829     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
1830     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
1831   }
1832 
1833   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1834   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1835   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1836   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1837   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1838   PetscFunctionReturn(0);
1839 }
1840 
1841 #undef __FUNCT__
1842 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1843 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1844 {
1845   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1846   IS             iscol=a->col,isrow=a->row;
1847   PetscErrorCode ierr;
1848   const PetscInt *r,*c,*rout,*cout;
1849   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1851   MatScalar      *aa=a->a,*v;
1852   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1853   PetscScalar    *x,*b,*t;
1854 
1855   PetscFunctionBegin;
1856   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1857   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858   t  = a->solve_work;
1859 
1860   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862 
1863   /* copy the b into temp work space according to permutation */
1864   ii = 0;
1865   for (i=0; i<n; i++) {
1866     ic      = 7*c[i];
1867     t[ii]   = b[ic];
1868     t[ii+1] = b[ic+1];
1869     t[ii+2] = b[ic+2];
1870     t[ii+3] = b[ic+3];
1871     t[ii+4] = b[ic+4];
1872     t[ii+5] = b[ic+5];
1873     t[ii+6] = b[ic+6];
1874     ii += 7;
1875   }
1876 
1877   /* forward solve the U^T */
1878   idx = 0;
1879   for (i=0; i<n; i++) {
1880 
1881     v     = aa + 49*diag[i];
1882     /* multiply by the inverse of the block diagonal */
1883     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1884     x6    = t[5+idx]; x7 = t[6+idx];
1885     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1886     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1887     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1888     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1889     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1890     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1891     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1892     v += 49;
1893 
1894     vi    = aj + diag[i] + 1;
1895     nz    = ai[i+1] - diag[i] - 1;
1896     while (nz--) {
1897       oidx = 7*(*vi++);
1898       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1899       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1900       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1901       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1902       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1903       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1904       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1905       v  += 49;
1906     }
1907     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1908     t[5+idx] = s6;t[6+idx] = s7;
1909     idx += 7;
1910   }
1911   /* backward solve the L^T */
1912   for (i=n-1; i>=0; i--){
1913     v    = aa + 49*diag[i] - 49;
1914     vi   = aj + diag[i] - 1;
1915     nz   = diag[i] - ai[i];
1916     idt  = 7*i;
1917     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1918     s6 = t[5+idt];s7 = t[6+idt];
1919     while (nz--) {
1920       idx   = 7*(*vi--);
1921       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1922       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1923       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1924       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1925       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1926       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1927       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1928       v -= 49;
1929     }
1930   }
1931 
1932   /* copy t into x according to permutation */
1933   ii = 0;
1934   for (i=0; i<n; i++) {
1935     ir      = 7*r[i];
1936     x[ir]   = t[ii];
1937     x[ir+1] = t[ii+1];
1938     x[ir+2] = t[ii+2];
1939     x[ir+3] = t[ii+3];
1940     x[ir+4] = t[ii+4];
1941     x[ir+5] = t[ii+5];
1942     x[ir+6] = t[ii+6];
1943     ii += 7;
1944   }
1945 
1946   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1949   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1951   PetscFunctionReturn(0);
1952 }
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1955 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1956 {
1957   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1958   PetscErrorCode ierr;
1959   IS             iscol=a->col,isrow=a->row;
1960   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1961   const PetscInt *r,*c,*rout,*cout;
1962   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
1963   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1964   MatScalar      *aa=a->a,*v;
1965   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1966   PetscScalar    *x,*b,*t;
1967 
1968   PetscFunctionBegin;
1969   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1970   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1971   t = a->solve_work;
1972 
1973   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1974   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1975 
1976   /* copy b into temp work space according to permutation */
1977   for(i=0;i<n;i++){
1978     ii = bs*i; ic = bs*c[i];
1979     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
1980     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
1981   }
1982 
1983   /* forward solve the U^T */
1984   idx = 0;
1985   for (i=0; i<n; i++) {
1986     v     = aa + bs2*diag[i];
1987     /* multiply by the inverse of the block diagonal */
1988     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1989     x6    = t[5+idx]; x7 = t[6+idx];
1990     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1991     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1992     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1993     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1994     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1995     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1996     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1997     v -= bs2;
1998 
1999     vi    = aj + diag[i] - 1;
2000     nz    = diag[i] - diag[i+1] - 1;
2001     for(j=0;j>-nz;j--){
2002       oidx = bs*vi[j];
2003       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2004       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2005       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2006       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2007       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2008       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2009       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2010       v  -= bs2;
2011     }
2012     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
2013     t[5+idx] = s6;  t[6+idx] = s7;
2014     idx += bs;
2015   }
2016   /* backward solve the L^T */
2017   for (i=n-1; i>=0; i--){
2018     v    = aa + bs2*ai[i];
2019     vi   = aj + ai[i];
2020     nz   = ai[i+1] - ai[i];
2021     idt  = bs*i;
2022     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
2023     s6   = t[5+idt];  s7 = t[6+idt];
2024    for(j=0;j<nz;j++){
2025       idx   = bs*vi[j];
2026       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
2027       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
2028       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
2029       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
2030       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
2031       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
2032       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
2033       v += bs2;
2034     }
2035   }
2036 
2037   /* copy t into x according to permutation */
2038   for(i=0;i<n;i++){
2039     ii = bs*i;  ir = bs*r[i];
2040     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
2041     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
2042   }
2043 
2044   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2045   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2046   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2047   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2048   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2049   PetscFunctionReturn(0);
2050 }
2051 
2052 /* ----------------------------------------------------------- */
2053 #undef __FUNCT__
2054 #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2055 PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2056 {
2057   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2058   IS             iscol=a->col,isrow=a->row;
2059   PetscErrorCode ierr;
2060   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2061   PetscInt       i,n=a->mbs;
2062   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
2063   MatScalar      *aa=a->a,*v;
2064   PetscScalar    *x,*b,*s,*t,*ls;
2065 
2066   PetscFunctionBegin;
2067   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2068   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2069   t  = a->solve_work;
2070 
2071   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2072   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2073 
2074   /* forward solve the lower triangular */
2075   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2076   for (i=1; i<n; i++) {
2077     v   = aa + bs2*ai[i];
2078     vi  = aj + ai[i];
2079     nz  = a->diag[i] - ai[i];
2080     s = t + bs*i;
2081     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
2082     while (nz--) {
2083       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
2084       v += bs2;
2085     }
2086   }
2087   /* backward solve the upper triangular */
2088   ls = a->solve_work + A->cmap->n;
2089   for (i=n-1; i>=0; i--){
2090     v   = aa + bs2*(a->diag[i] + 1);
2091     vi  = aj + a->diag[i] + 1;
2092     nz  = ai[i+1] - a->diag[i] - 1;
2093     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2094     while (nz--) {
2095       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
2096       v += bs2;
2097     }
2098     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2099     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2100   }
2101 
2102   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2103   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2104   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2105   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2106   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2107   PetscFunctionReturn(0);
2108 }
2109 
2110 /* ----------------------------------------------------------- */
2111 #undef __FUNCT__
2112 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2113 PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
2114 {
2115   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2116   IS                iscol=a->col,isrow=a->row;
2117   PetscErrorCode    ierr;
2118   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2119   PetscInt          i,n=a->mbs,j;
2120   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2121   const MatScalar   *aa=a->a,*v;
2122   PetscScalar       *x,*t,*ls;
2123   const PetscScalar *b;
2124   PetscFunctionBegin;
2125   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2126   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2127   t    = a->solve_work;
2128 
2129   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2130   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2131 
2132   /* copy the b into temp work space according to permutation */
2133   for (i=0; i<n; i++) {
2134     for (j=0; j<bs; j++) {
2135       t[i*bs+j] = b[c[i]*bs+j];
2136     }
2137   }
2138 
2139 
2140   /* forward solve the upper triangular transpose */
2141   ls = a->solve_work + A->cmap->n;
2142   for (i=0; i<n; i++){
2143     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2144     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
2145     v   = aa + bs2*(a->diag[i] + 1);
2146     vi  = aj + a->diag[i] + 1;
2147     nz  = ai[i+1] - a->diag[i] - 1;
2148     while (nz--) {
2149       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2150       v += bs2;
2151     }
2152   }
2153 
2154   /* backward solve the lower triangular transpose */
2155   for (i=n-1; i>=0; i--) {
2156     v   = aa + bs2*ai[i];
2157     vi  = aj + ai[i];
2158     nz  = a->diag[i] - ai[i];
2159     while (nz--) {
2160       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
2161       v += bs2;
2162     }
2163   }
2164 
2165   /* copy t into x according to permutation */
2166   for (i=0; i<n; i++) {
2167     for (j=0; j<bs; j++) {
2168       x[bs*r[i]+j]   = t[bs*i+j];
2169     }
2170   }
2171 
2172   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2173   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2174   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2175   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2176   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2177   PetscFunctionReturn(0);
2178 }
2179 
2180 #undef __FUNCT__
2181 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
2182 PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
2183 {
2184   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2185   IS                iscol=a->col,isrow=a->row;
2186   PetscErrorCode    ierr;
2187   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2188   PetscInt          i,n=a->mbs,j;
2189   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
2190   const MatScalar   *aa=a->a,*v;
2191   PetscScalar       *x,*t,*ls;
2192   const PetscScalar *b;
2193   PetscFunctionBegin;
2194   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2195   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2196   t    = a->solve_work;
2197 
2198   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2199   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2200 
2201   /* copy the b into temp work space according to permutation */
2202   for (i=0; i<n; i++) {
2203     for (j=0; j<bs; j++) {
2204       t[i*bs+j] = b[c[i]*bs+j];
2205     }
2206   }
2207 
2208 
2209   /* forward solve the upper triangular transpose */
2210   ls = a->solve_work + A->cmap->n;
2211   for (i=0; i<n; i++){
2212     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
2213     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
2214     v   = aa + bs2*(diag[i] - 1);
2215     vi  = aj + diag[i] - 1;
2216     nz  = diag[i] - diag[i+1] - 1;
2217     for(j=0;j>-nz;j--){
2218       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2219       v -= bs2;
2220     }
2221   }
2222 
2223   /* backward solve the lower triangular transpose */
2224   for (i=n-1; i>=0; i--) {
2225     v   = aa + bs2*ai[i];
2226     vi  = aj + ai[i];
2227     nz  = ai[i+1] - ai[i];
2228     for(j=0;j<nz;j++){
2229       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
2230       v += bs2;
2231     }
2232   }
2233 
2234   /* copy t into x according to permutation */
2235   for (i=0; i<n; i++) {
2236     for (j=0; j<bs; j++) {
2237       x[bs*r[i]+j]   = t[bs*i+j];
2238     }
2239   }
2240 
2241   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2242   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2243   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2244   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2245   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
2246   PetscFunctionReturn(0);
2247 }
2248 
2249 /* bs = 15 for PFLOTRAN */
2250 #undef __FUNCT__
2251 #define __FUNCT__ "MatSolve_SeqBAIJ_15"
2252 PetscErrorCode MatSolve_SeqBAIJ_15(Mat A,Vec bb,Vec xx)
2253 {
2254   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2255   IS             iscol=a->col,isrow=a->row;
2256   PetscErrorCode ierr;
2257   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi,bs=A->rmap->bs,bs2=a->bs2;
2258   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2259   MatScalar      *aa=a->a,*v;
2260   PetscScalar    s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
2261   PetscScalar    x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
2262   PetscScalar    *x,*b,*t;
2263 
2264   PetscFunctionBegin;
2265   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2266   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2267   t  = a->solve_work;
2268 
2269   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2270   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2271 
2272   /* forward solve the lower triangular */
2273   idx    = bs*r[0];
2274   t[0]  = b[idx];    t[1]  = b[1+idx];  t[2]  = b[2+idx];  t[3]  = b[3+idx];  t[4]  = b[4+idx];
2275   t[5]  = b[5+idx];  t[6]  = b[6+idx];  t[7]  = b[7+idx];  t[8]  = b[8+idx];  t[9]  = b[9+idx];
2276   t[10] = b[10+idx]; t[11] = b[11+idx]; t[12] = b[12+idx]; t[13] = b[13+idx]; t[14] = b[14+idx];
2277 
2278   for (i=1; i<n; i++) {
2279     v     = aa + bs2*ai[i];
2280     vi    = aj + ai[i];
2281     nz    = ai[i+1] - ai[i];
2282     idx   = bs*r[i];
2283     s1   = b[idx];    s2  = b[1+idx];  s3  = b[2+idx];  s4  = b[3+idx];  s5  = b[4+idx];
2284     s6   = b[5+idx];  s7  = b[6+idx];  s8  = b[7+idx];  s9  = b[8+idx];  s10 = b[9+idx];
2285     s11  = b[10+idx]; s12 = b[11+idx]; s13 = b[12+idx]; s14 = b[13+idx]; s15 = b[14+idx];
2286     for(m=0;m<nz;m++){
2287       idx   = bs*vi[m];
2288       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2289       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2290       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2291 
2292       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2293       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2294       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2295       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2296       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2297       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2298       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2299       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2300       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2301       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2302       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2303       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2304       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2305       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2306       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2307 
2308       v += bs2;
2309     }
2310     idx = bs*i;
2311     t[idx]    = s1;  t[1+idx]  = s2;  t[2+idx]  = s3;  t[3+idx]  = s4;  t[4+idx]  = s5;
2312     t[5+idx]  = s6;  t[6+idx]  = s7;  t[7+idx]  = s8;  t[8+idx]  = s9;  t[9+idx]  = s10;
2313     t[10+idx] = s11; t[11+idx] = s12; t[12+idx] = s13; t[13+idx] = s14; t[14+idx] = s15;
2314 
2315   }
2316   /* backward solve the upper triangular */
2317   for (i=n-1; i>=0; i--){
2318     v    = aa + bs2*(adiag[i+1]+1);
2319     vi   = aj + adiag[i+1]+1;
2320     nz   = adiag[i] - adiag[i+1] - 1;
2321     idt  = bs*i;
2322     s1   = t[idt];     s2  = t[1+idt];  s3  = t[2+idt];  s4  = t[3+idt];  s5  = t[4+idt];
2323     s6   = t[5+idt];   s7  = t[6+idt];  s8  = t[7+idt];  s9  = t[8+idt];  s10 = t[9+idt];
2324     s11  = t[10+idt]; s12  = t[11+idt]; s13 = t[12+idt]; s14 = t[13+idt]; s15 = t[14+idt];
2325 
2326     for(m=0;m<nz;m++){
2327       idx   = bs*vi[m];
2328       x1   = t[idx];     x2  = t[1+idx];  x3  = t[2+idx];  x4  = t[3+idx];  x5  = t[4+idx];
2329       x6   = t[5+idx];   x7  = t[6+idx];  x8  = t[7+idx];  x9  = t[8+idx];  x10 = t[9+idx];
2330       x11  = t[10+idx]; x12  = t[11+idx]; x13 = t[12+idx]; x14 = t[13+idx]; x15 = t[14+idx];
2331 
2332       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
2333       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
2334       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
2335       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
2336       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
2337       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
2338       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
2339       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
2340       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
2341       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
2342       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
2343       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
2344       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
2345       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
2346       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
2347 
2348       v += bs2;
2349     }
2350     idc = bs*c[i];
2351 
2352     x[idc]    = t[idt]    = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
2353     x[1+idc]  = t[1+idt]  = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
2354     x[2+idc]  = t[2+idt]  = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
2355     x[3+idc]  = t[3+idt]  = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
2356     x[4+idc]  = t[4+idt]  = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
2357     x[5+idc]  = t[5+idt]  = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
2358     x[6+idc]  = t[6+idt]  = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
2359     x[7+idc]  = t[7+idt]  = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
2360     x[8+idc]  = t[8+idt]  = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
2361     x[9+idc]  = t[9+idt]  = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
2362     x[10+idc] = t[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
2363     x[11+idc] = t[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
2364     x[12+idc] = t[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
2365     x[13+idc] = t[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
2366     x[14+idc] = t[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
2367 
2368   }
2369 
2370   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2371   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2372   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2373   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2374   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2375   PetscFunctionReturn(0);
2376 }
2377 
/*
   MatSolve_SeqBAIJ_7_inplace - Block forward/backward substitution for block size 7,
   applying the row index set a->row to the right-hand side and the column index set
   a->col to the solution.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each 7x7 block is read as 49 consecutive MatScalars; the entry in block row r and
   block column k is taken from offset r + 7*k.

   In block row i the blocks at positions ai[i] .. diag[i]-1 are used by the forward
   sweep and those at diag[i]+1 .. ai[i+1]-1 by the backward sweep.  The block at
   diag[i] is applied by multiplication, so it is presumably stored already inverted
   by the factorization routine (not visible here) -- TODO confirm.

   a->solve_work (t) holds the intermediate vector, indexed by block row of the factor.

   NOTE(review): r[0] and cout + (n-1) are used without any check on n; callers
   presumably guarantee n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2378 #undef __FUNCT__
2379 #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2380 PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
2381 {
2382   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2383   IS             iscol=a->col,isrow=a->row;
2384   PetscErrorCode ierr;
2385   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
2386   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
2387   MatScalar      *aa=a->a,*v;
2388   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2389   PetscScalar    *x,*b,*t;
2390 
2391   PetscFunctionBegin;
2392   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2393   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2394   t  = a->solve_work;
2395 
2396   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2397   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* c is walked backward in the backward sweep */
2398 
2399   /* forward solve the lower triangular */
2400   idx    = 7*(*r++); /* block row 0 has no earlier blocks: copy the permuted rhs block */
2401   t[0] = b[idx];   t[1] = b[1+idx];
2402   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2403   t[5] = b[5+idx]; t[6] = b[6+idx];
2404 
2405   for (i=1; i<n; i++) {
2406     v     = aa + 49*ai[i];
2407     vi    = aj + ai[i];
2408     nz    = diag[i] - ai[i]; /* blocks of row i stored before position diag[i] */
2409     idx   = 7*(*r++);
2410     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2411     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2412     while (nz--) {
2413       idx   = 7*(*vi++);
2414       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2415       x4    = t[3+idx];x5 = t[4+idx];
2416       x6    = t[5+idx];x7 = t[6+idx];
            /* s -= (7x7 block at v) * (block of t at column vi) */
2417       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2418       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2419       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2420       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2421       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2422       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2423       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2424       v += 49;
2425     }
2426     idx = 7*i;
2427     t[idx]   = s1;t[1+idx] = s2;
2428     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2429     t[5+idx] = s6;t[6+idx] = s7;
2430   }
2431   /* backward solve the upper triangular */
2432   for (i=n-1; i>=0; i--){
2433     v    = aa + 49*diag[i] + 49; /* first block after position diag[i] */
2434     vi   = aj + diag[i] + 1;
2435     nz   = ai[i+1] - diag[i] - 1;
2436     idt  = 7*i;
2437     s1 = t[idt];  s2 = t[1+idt];
2438     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2439     s6 = t[5+idt];s7 = t[6+idt];
2440     while (nz--) {
2441       idx   = 7*(*vi++);
2442       x1    = t[idx];   x2 = t[1+idx];
2443       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2444       x6    = t[5+idx]; x7 = t[6+idx];
2445       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2446       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2447       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2448       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2449       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2450       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2451       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2452       v += 49;
2453     }
2454     idc = 7*(*c--); /* destination block in x, taken from the column index set */
2455     v   = aa + 49*diag[i]; /* block at diag[i]; multiplied in below and written to both x and t */
2456     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2457                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2458     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2459                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2460     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2461                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2462     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2463                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2464     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2465                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2466     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2467                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2468     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2469                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2470   }
2471 
2472   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2473   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2474   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2475   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2476   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2477   PetscFunctionReturn(0);
2478 }
2479 
/*
   MatSolve_SeqBAIJ_7 - Block forward/backward substitution for block size 7 with row
   index set a->row and column index set a->col, for the factor layout in which the
   upper-triangular part is addressed through a->diag (adiag).

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each 7x7 block is read as 49 consecutive MatScalars; the entry in block row r and
   block column k is taken from offset r + 7*k.

   Forward sweep: all blocks ai[i] .. ai[i+1]-1 of block row i are applied.
   Backward sweep: blocks adiag[i+1]+1 .. adiag[i]-1 are applied, after which v points
   at block adiag[i]; that block is applied by multiplication, so it is presumably
   stored already inverted by the factorization routine (not visible here) -- TODO confirm.
   adiag[n] is read when i == n-1, so adiag needs at least n+1 entries.

   a->solve_work (t) holds the intermediate vector, indexed by block row of the factor.

   NOTE(review): r[0] is read without any check on n; callers presumably guarantee
   n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2480 #undef __FUNCT__
2481 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
2482 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
2483 {
2484   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2485   IS             iscol=a->col,isrow=a->row;
2486   PetscErrorCode ierr;
2487   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
2488   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
2489   MatScalar      *aa=a->a,*v;
2490   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2491   PetscScalar    *x,*b,*t;
2492 
2493   PetscFunctionBegin;
2494   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2495   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2496   t  = a->solve_work;
2497 
2498   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2499   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2500 
2501   /* forward solve the lower triangular */
2502   idx    = 7*r[0]; /* block row 0 has no earlier blocks: copy the permuted rhs block */
2503   t[0] = b[idx];   t[1] = b[1+idx];
2504   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2505   t[5] = b[5+idx]; t[6] = b[6+idx];
2506 
2507   for (i=1; i<n; i++) {
2508     v     = aa + 49*ai[i];
2509     vi    = aj + ai[i];
2510     nz    = ai[i+1] - ai[i]; /* every block listed for row i in ai/aj is applied here */
2511     idx   = 7*r[i];
2512     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2513     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2514     for(m=0;m<nz;m++){
2515       idx   = 7*vi[m];
2516       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2517       x4    = t[3+idx];x5 = t[4+idx];
2518       x6    = t[5+idx];x7 = t[6+idx];
            /* s -= (7x7 block at v) * (block of t at column vi[m]) */
2519       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2520       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2521       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2522       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2523       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2524       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2525       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2526       v += 49;
2527     }
2528     idx = 7*i;
2529     t[idx]   = s1;t[1+idx] = s2;
2530     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2531     t[5+idx] = s6;t[6+idx] = s7;
2532   }
2533   /* backward solve the upper triangular */
2534   for (i=n-1; i>=0; i--){
2535     v    = aa + 49*(adiag[i+1]+1);
2536     vi   = aj + adiag[i+1]+1;
2537     nz   = adiag[i] - adiag[i+1] - 1; /* blocks strictly between positions adiag[i+1] and adiag[i] */
2538     idt  = 7*i;
2539     s1 = t[idt];  s2 = t[1+idt];
2540     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2541     s6 = t[5+idt];s7 = t[6+idt];
2542     for(m=0;m<nz;m++){
2543       idx   = 7*vi[m];
2544       x1    = t[idx];   x2 = t[1+idx];
2545       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2546       x6    = t[5+idx]; x7 = t[6+idx];
2547       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2548       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2549       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2550       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2551       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2552       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2553       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2554       v += 49;
2555     }
2556     idc = 7*c[i]; /* v was advanced nz blocks and now points at block adiag[i], used below */
2557     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2558                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2559     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2560                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2561     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2562                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2563     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2564                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2565     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2566                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2567     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2568                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2569     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2570                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
2571   }
2572 
2573   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2574   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2575   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2576   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2577   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2578   PetscFunctionReturn(0);
2579 }
2580 
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering_inplace - Block forward/backward substitution for
   block size 7 with no row or column index sets: block row i of bb maps directly to
   block row i of xx, and xx itself is used as the work vector.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector (only read here)

   Output Parameter:
.  xx - solution vector

   Notes:
   Each 7x7 block is read as 49 consecutive MatScalars; the entry in block row r and
   block column k is taken from offset r + 7*k.

   In block row i the blocks at positions ai[i] .. diag[i]-1 are used by the forward
   sweep and those at diag[i]+1 .. ai[i+1]-1 by the backward sweep.  The block at
   diag[i] is applied by multiplication, so it is presumably stored already inverted
   by the factorization routine (not visible here) -- TODO confirm.

   NOTE(review): b[0..6] and x[0..6] are accessed without any check on n; callers
   presumably guarantee n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2581 #undef __FUNCT__
2582 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2583 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2584 {
2585   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2586   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2587   PetscErrorCode    ierr;
2588   PetscInt          *diag = a->diag,jdx;
2589   const MatScalar   *aa=a->a,*v;
2590   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2591   const PetscScalar *b;
2592 
2593   PetscFunctionBegin;
2594   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2595   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2596   /* forward solve the lower triangular */
2597   idx    = 0; /* block row 0 has no earlier blocks: copy the rhs block */
2598   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2599   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2600   x[6] = b[6+idx];
2601   for (i=1; i<n; i++) {
2602     v     =  aa + 49*ai[i];
2603     vi    =  aj + ai[i];
2604     nz    =  diag[i] - ai[i]; /* blocks of row i stored before position diag[i] */
2605     idx   =  7*i;
2606     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2607     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2608     s7  =  b[6+idx];
2609     while (nz--) {
2610       jdx   = 7*(*vi++);
2611       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2612       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2613       x7    = x[6+jdx];
            /* s -= (7x7 block at v) * (block of x at column vi) */
2614       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2615       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2616       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2617       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2618       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2619       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2620       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2621       v += 49;
2622      }
2623     x[idx]   = s1;
2624     x[1+idx] = s2;
2625     x[2+idx] = s3;
2626     x[3+idx] = s4;
2627     x[4+idx] = s5;
2628     x[5+idx] = s6;
2629     x[6+idx] = s7;
2630   }
2631   /* backward solve the upper triangular */
2632   for (i=n-1; i>=0; i--){
2633     v    = aa + 49*diag[i] + 49; /* first block after position diag[i] */
2634     vi   = aj + diag[i] + 1;
2635     nz   = ai[i+1] - diag[i] - 1;
2636     idt  = 7*i;
2637     s1 = x[idt];   s2 = x[1+idt];
2638     s3 = x[2+idt]; s4 = x[3+idt];
2639     s5 = x[4+idt]; s6 = x[5+idt];
2640     s7 = x[6+idt];
2641     while (nz--) {
2642       idx   = 7*(*vi++);
2643       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
2644       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2645       x7    = x[6+idx];
2646       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2647       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2648       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2649       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2650       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2651       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2652       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
2653       v += 49;
2654     }
2655     v        = aa + 49*diag[i]; /* block at diag[i]; multiplied in below */
2656     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2657                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2658     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2659                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2660     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2661                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2662     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2663                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2664     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2665                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2666     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2667                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2668     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2669                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
2670   }
2671 
2672   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2673   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
      /* block size 7: 49 entries per block and 7 rows per block row, the same count the permuted bs=7 solves log */
2674   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
2675   PetscFunctionReturn(0);
2676 }
2677 
/*
   MatSolve_SeqBAIJ_7_NaturalOrdering - Block forward/backward substitution for block
   size 7 with no row or column index sets, for the factor layout in which the
   upper-triangular part is addressed through a->diag (adiag).  xx itself is used as
   the work vector.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector (only read here)

   Output Parameter:
.  xx - solution vector

   Notes:
   Strides come from bs = A->rmap->bs and bs2 = a->bs2, but the offsets inside each
   block (v[0], v[7], ..., v[48]) are hard-coded, so the routine assumes bs == 7 and
   bs2 == 49.  The entry in block row r and block column k is taken from offset r + 7*k.

   Forward sweep: all blocks ai[i] .. ai[i+1]-1 of block row i are applied.
   Backward sweep: blocks adiag[i+1]+1 .. adiag[i]-1 are applied, after which v points
   at block adiag[i]; that block is applied by multiplication (the code's own comment
   calls it the inverse diagonal).  adiag[n] is read when i == n-1, so adiag needs at
   least n+1 entries.

   NOTE(review): b[0..6] and x[0..6] are accessed without any check on n; callers
   presumably guarantee n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2678 #undef __FUNCT__
2679 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
2680 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
2681 {
2682     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2683     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2684     PetscErrorCode    ierr;
2685     PetscInt          idx,jdx,idt;
2686     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2687     const MatScalar   *aa=a->a,*v;
2688     PetscScalar       *x;
2689     const PetscScalar *b;
2690     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2691 
2692     PetscFunctionBegin;
2693     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2694     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2695     /* forward solve the lower triangular */
2696     idx    = 0; /* block row 0 has no earlier blocks: copy the rhs block */
2697     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2698     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
2699     for (i=1; i<n; i++) {
2700        v    = aa + bs2*ai[i];
2701        vi   = aj + ai[i];
2702        nz   = ai[i+1] - ai[i]; /* every block listed for row i in ai/aj is applied here */
2703       idx   = bs*i;
2704        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2705        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
2706        for(k=0;k<nz;k++) {
2707           jdx   = bs*vi[k];
2708           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2709 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
               /* s -= (7x7 block at v) * (block of x at column vi[k]) */
2710           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2711           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2712           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2713 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2714           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2715 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2716 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2717           v   +=  bs2;
2718         }
2719 
2720        x[idx]   = s1;
2721        x[1+idx] = s2;
2722        x[2+idx] = s3;
2723        x[3+idx] = s4;
2724        x[4+idx] = s5;
2725        x[5+idx] = s6;
2726        x[6+idx] = s7;
2727     }
2728 
2729    /* backward solve the upper triangular */
2730   for (i=n-1; i>=0; i--){
2731     v   = aa + bs2*(adiag[i+1]+1);
2732      vi  = aj + adiag[i+1]+1;
2733      nz  = adiag[i] - adiag[i+1]-1; /* blocks strictly between positions adiag[i+1] and adiag[i] */
2734      idt = bs*i;
2735      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2736      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
2737     for(k=0;k<nz;k++) {
2738       idx   = bs*vi[k];
2739        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2740        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
2741        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
2742        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
2743        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
2744        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
2745        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
2746        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
2747        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
2748         v   +=  bs2;
2749     }
2750     /* x = inv_diagonal*x; v was advanced nz blocks and now points at block adiag[i] */
2751     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
2752     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
2753     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
2754     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
2755     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
2756     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
2757     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
2758   }
2759 
2760   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2762   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2763   PetscFunctionReturn(0);
2764 }
2765 
/*
   MatSolve_SeqBAIJ_6_inplace - Block forward/backward substitution for block size 6,
   applying the row index set a->row to the right-hand side and the column index set
   a->col to the solution.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector (only read here)

   Output Parameter:
.  xx - solution vector

   Notes:
   Each 6x6 block is read as 36 consecutive MatScalars; the entry in block row r and
   block column k is taken from offset r + 6*k.

   In block row i the blocks at positions ai[i] .. diag[i]-1 are used by the forward
   sweep and those at diag[i]+1 .. ai[i+1]-1 by the backward sweep.  The block at
   diag[i] is applied by multiplication, so it is presumably stored already inverted
   by the factorization routine (not visible here) -- TODO confirm.

   a->solve_work (t) holds the intermediate vector, indexed by block row of the factor.

   NOTE(review): r[0] and cout + (n-1) are used without any check on n; callers
   presumably guarantee n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2766 #undef __FUNCT__
2767 #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2768 PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
2769 {
2770   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2771   IS                iscol=a->col,isrow=a->row;
2772   PetscErrorCode    ierr;
2773   const PetscInt    *r,*c,*rout,*cout;
2774   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2775   const MatScalar   *aa=a->a,*v;
2776   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2777   const PetscScalar *b;
2778   PetscFunctionBegin;
2779   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2780   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2781   t  = a->solve_work;
2782 
2783   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2784   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* c is walked backward in the backward sweep */
2785 
2786   /* forward solve the lower triangular */
2787   idx    = 6*(*r++); /* block row 0 has no earlier blocks: copy the permuted rhs block */
2788   t[0] = b[idx];   t[1] = b[1+idx];
2789   t[2] = b[2+idx]; t[3] = b[3+idx];
2790   t[4] = b[4+idx]; t[5] = b[5+idx];
2791   for (i=1; i<n; i++) {
2792     v     = aa + 36*ai[i];
2793     vi    = aj + ai[i];
2794     nz    = diag[i] - ai[i]; /* blocks of row i stored before position diag[i] */
2795     idx   = 6*(*r++);
2796     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2797     s5  = b[4+idx]; s6 = b[5+idx];
2798     while (nz--) {
2799       idx   = 6*(*vi++);
2800       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2801       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
            /* s -= (6x6 block at v) * (block of t at column vi) */
2802       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2803       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2804       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2805       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2806       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2807       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2808       v += 36;
2809     }
2810     idx = 6*i;
2811     t[idx]   = s1;t[1+idx] = s2;
2812     t[2+idx] = s3;t[3+idx] = s4;
2813     t[4+idx] = s5;t[5+idx] = s6;
2814   }
2815   /* backward solve the upper triangular */
2816   for (i=n-1; i>=0; i--){
2817     v    = aa + 36*diag[i] + 36; /* first block after position diag[i] */
2818     vi   = aj + diag[i] + 1;
2819     nz   = ai[i+1] - diag[i] - 1;
2820     idt  = 6*i;
2821     s1 = t[idt];  s2 = t[1+idt];
2822     s3 = t[2+idt];s4 = t[3+idt];
2823     s5 = t[4+idt];s6 = t[5+idt];
2824     while (nz--) {
2825       idx   = 6*(*vi++);
2826       x1    = t[idx];   x2 = t[1+idx];
2827       x3    = t[2+idx]; x4 = t[3+idx];
2828       x5    = t[4+idx]; x6 = t[5+idx];
2829       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2830       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2831       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2832       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2833       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2834       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2835       v += 36;
2836     }
2837     idc = 6*(*c--); /* destination block in x, taken from the column index set */
2838     v   = aa + 36*diag[i]; /* block at diag[i]; multiplied in below and written to both x and t */
2839     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2840                                  v[18]*s4+v[24]*s5+v[30]*s6;
2841     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2842                                  v[19]*s4+v[25]*s5+v[31]*s6;
2843     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2844                                  v[20]*s4+v[26]*s5+v[32]*s6;
2845     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2846                                  v[21]*s4+v[27]*s5+v[33]*s6;
2847     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2848                                  v[22]*s4+v[28]*s5+v[34]*s6;
2849     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2850                                  v[23]*s4+v[29]*s5+v[35]*s6;
2851   }
2852 
2853   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2854   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2855   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2856   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2857   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2858   PetscFunctionReturn(0);
2859 }
2860 
/*
   MatSolve_SeqBAIJ_6 - Block forward/backward substitution for block size 6 with row
   index set a->row and column index set a->col, for the factor layout in which the
   upper-triangular part is addressed through a->diag (adiag).

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data supplies the arrays a->i, a->j, a->a and a->diag
-  bb - right-hand side vector (only read here)

   Output Parameter:
.  xx - solution vector

   Notes:
   Each 6x6 block is read as 36 consecutive MatScalars; the entry in block row r and
   block column k is taken from offset r + 6*k.

   Forward sweep: all blocks ai[i] .. ai[i+1]-1 of block row i are applied.
   Backward sweep: blocks adiag[i+1]+1 .. adiag[i]-1 are applied, after which v points
   at block adiag[i]; that block is applied by multiplication, so it is presumably
   stored already inverted by the factorization routine (not visible here) -- TODO confirm.
   adiag[n] is read when i == n-1, so adiag needs at least n+1 entries.

   a->solve_work (t) holds the intermediate vector, indexed by block row of the factor.

   NOTE(review): r[0] is read without any check on n; callers presumably guarantee
   n >= 1 -- confirm.

   Returns 0 through PetscFunctionReturn(); results of the PETSc calls are checked
   with CHKERRQ.
*/
2861 #undef __FUNCT__
2862 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2863 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
2864 {
2865   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2866   IS                iscol=a->col,isrow=a->row;
2867   PetscErrorCode    ierr;
2868   const PetscInt    *r,*c,*rout,*cout;
2869   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2870   const MatScalar   *aa=a->a,*v;
2871   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2872   const PetscScalar *b;
2873   PetscFunctionBegin;
2874   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2875   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2876   t  = a->solve_work;
2877 
2878   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2879   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2880 
2881   /* forward solve the lower triangular */
2882   idx    = 6*r[0]; /* block row 0 has no earlier blocks: copy the permuted rhs block */
2883   t[0] = b[idx];   t[1] = b[1+idx];
2884   t[2] = b[2+idx]; t[3] = b[3+idx];
2885   t[4] = b[4+idx]; t[5] = b[5+idx];
2886   for (i=1; i<n; i++) {
2887     v     = aa + 36*ai[i];
2888     vi    = aj + ai[i];
2889     nz    = ai[i+1] - ai[i]; /* every block listed for row i in ai/aj is applied here */
2890     idx   = 6*r[i];
2891     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2892     s5  = b[4+idx]; s6 = b[5+idx];
2893     for(m=0;m<nz;m++){
2894       idx   = 6*vi[m];
2895       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2896       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
            /* s -= (6x6 block at v) * (block of t at column vi[m]) */
2897       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2898       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2899       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2900       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2901       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2902       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2903       v += 36;
2904     }
2905     idx = 6*i;
2906     t[idx]   = s1;t[1+idx] = s2;
2907     t[2+idx] = s3;t[3+idx] = s4;
2908     t[4+idx] = s5;t[5+idx] = s6;
2909   }
2910   /* backward solve the upper triangular */
2911   for (i=n-1; i>=0; i--){
2912     v    = aa + 36*(adiag[i+1]+1);
2913     vi   = aj + adiag[i+1]+1;
2914     nz   = adiag[i] - adiag[i+1] - 1; /* blocks strictly between positions adiag[i+1] and adiag[i] */
2915     idt  = 6*i;
2916     s1 = t[idt];  s2 = t[1+idt];
2917     s3 = t[2+idt];s4 = t[3+idt];
2918     s5 = t[4+idt];s6 = t[5+idt];
2919     for(m=0;m<nz;m++){
2920       idx   = 6*vi[m];
2921       x1    = t[idx];   x2 = t[1+idx];
2922       x3    = t[2+idx]; x4 = t[3+idx];
2923       x5    = t[4+idx]; x6 = t[5+idx];
2924       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2925       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2926       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2927       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2928       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2929       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2930       v += 36;
2931     }
2932     idc = 6*c[i]; /* v was advanced nz blocks and now points at block adiag[i], used below */
2933     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2934                                  v[18]*s4+v[24]*s5+v[30]*s6;
2935     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2936                                  v[19]*s4+v[25]*s5+v[31]*s6;
2937     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2938                                  v[20]*s4+v[26]*s5+v[32]*s6;
2939     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2940                                  v[21]*s4+v[27]*s5+v[33]*s6;
2941     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2942                                  v[22]*s4+v[28]*s5+v[34]*s6;
2943     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2944                                  v[23]*s4+v[29]*s5+v[35]*s6;
2945   }
2946 
2947   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2948   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2949   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2950   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2951   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
2952   PetscFunctionReturn(0);
2953 }
2954 
2955 #undef __FUNCT__
2956 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
2957 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
2958 {
2959   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2960   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2961   PetscErrorCode    ierr;
2962   PetscInt          *diag = a->diag,jdx;
2963   const MatScalar   *aa=a->a,*v;
2964   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2965   const PetscScalar *b;
2966 
2967   PetscFunctionBegin;
2968   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2969   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2970   /* forward solve the lower triangular */
2971   idx    = 0;
2972   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
2973   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
2974   for (i=1; i<n; i++) {
2975     v     =  aa + 36*ai[i];
2976     vi    =  aj + ai[i];
2977     nz    =  diag[i] - ai[i];
2978     idx   =  6*i;
2979     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2980     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2981     while (nz--) {
2982       jdx   = 6*(*vi++);
2983       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
2984       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2985       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2986       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2987       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2988       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2989       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2990       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
2991       v += 36;
2992      }
2993     x[idx]   = s1;
2994     x[1+idx] = s2;
2995     x[2+idx] = s3;
2996     x[3+idx] = s4;
2997     x[4+idx] = s5;
2998     x[5+idx] = s6;
2999   }
3000   /* backward solve the upper triangular */
3001   for (i=n-1; i>=0; i--){
3002     v    = aa + 36*diag[i] + 36;
3003     vi   = aj + diag[i] + 1;
3004     nz   = ai[i+1] - diag[i] - 1;
3005     idt  = 6*i;
3006     s1 = x[idt];   s2 = x[1+idt];
3007     s3 = x[2+idt]; s4 = x[3+idt];
3008     s5 = x[4+idt]; s6 = x[5+idt];
3009     while (nz--) {
3010       idx   = 6*(*vi++);
3011       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3012       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3013       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3014       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3015       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3016       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3017       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3018       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
3019       v += 36;
3020     }
3021     v        = aa + 36*diag[i];
3022     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3023     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3024     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3025     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3026     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3027     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3028   }
3029 
3030   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3031   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3032   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
3033   PetscFunctionReturn(0);
3034 }
3035 
3036 #undef __FUNCT__
3037 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
3038 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
3039 {
3040     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3041     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3042     PetscErrorCode    ierr;
3043     PetscInt          idx,jdx,idt;
3044     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3045     const MatScalar   *aa=a->a,*v;
3046     PetscScalar       *x;
3047     const PetscScalar *b;
3048     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3049 
3050     PetscFunctionBegin;
3051     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3052     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3053     /* forward solve the lower triangular */
3054     idx    = 0;
3055     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3056     x[4] = b[4+idx];x[5] = b[5+idx];
3057     for (i=1; i<n; i++) {
3058        v    = aa + bs2*ai[i];
3059        vi   = aj + ai[i];
3060        nz   = ai[i+1] - ai[i];
3061       idx   = bs*i;
3062        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3063        s5   = b[4+idx];s6 = b[5+idx];
3064        for(k=0;k<nz;k++){
3065           jdx   = bs*vi[k];
3066           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3067 	  x5    = x[4+jdx]; x6 = x[5+jdx];
3068           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3069           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3070           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3071 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3072           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3073 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3074           v   +=  bs2;
3075         }
3076 
3077        x[idx]   = s1;
3078        x[1+idx] = s2;
3079        x[2+idx] = s3;
3080        x[3+idx] = s4;
3081        x[4+idx] = s5;
3082        x[5+idx] = s6;
3083     }
3084 
3085    /* backward solve the upper triangular */
3086   for (i=n-1; i>=0; i--){
3087     v   = aa + bs2*(adiag[i+1]+1);
3088      vi  = aj + adiag[i+1]+1;
3089      nz  = adiag[i] - adiag[i+1]-1;
3090      idt = bs*i;
3091      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3092      s5 = x[4+idt];s6 = x[5+idt];
3093      for(k=0;k<nz;k++){
3094       idx   = bs*vi[k];
3095        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3096        x5    = x[4+idx];x6 = x[5+idx];
3097        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
3098        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
3099        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
3100        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
3101        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
3102        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
3103         v   +=  bs2;
3104     }
3105     /* x = inv_diagonal*x */
3106    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3107    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3108    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3109    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3110    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3111    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
3112   }
3113 
3114   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3115   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3116   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3117   PetscFunctionReturn(0);
3118 }
3119 
3120 #undef __FUNCT__
3121 #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
3122 PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
3123 {
3124   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3125   IS                iscol=a->col,isrow=a->row;
3126   PetscErrorCode    ierr;
3127   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3128   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3129   const MatScalar   *aa=a->a,*v;
3130   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3131   const PetscScalar *b;
3132 
3133   PetscFunctionBegin;
3134   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3135   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3136   t  = a->solve_work;
3137 
3138   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3139   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3140 
3141   /* forward solve the lower triangular */
3142   idx    = 5*(*r++);
3143   t[0] = b[idx];   t[1] = b[1+idx];
3144   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3145   for (i=1; i<n; i++) {
3146     v     = aa + 25*ai[i];
3147     vi    = aj + ai[i];
3148     nz    = diag[i] - ai[i];
3149     idx   = 5*(*r++);
3150     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3151     s5  = b[4+idx];
3152     while (nz--) {
3153       idx   = 5*(*vi++);
3154       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3155       x4    = t[3+idx];x5 = t[4+idx];
3156       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3157       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3158       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3159       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3160       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3161       v += 25;
3162     }
3163     idx = 5*i;
3164     t[idx]   = s1;t[1+idx] = s2;
3165     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3166   }
3167   /* backward solve the upper triangular */
3168   for (i=n-1; i>=0; i--){
3169     v    = aa + 25*diag[i] + 25;
3170     vi   = aj + diag[i] + 1;
3171     nz   = ai[i+1] - diag[i] - 1;
3172     idt  = 5*i;
3173     s1 = t[idt];  s2 = t[1+idt];
3174     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3175     while (nz--) {
3176       idx   = 5*(*vi++);
3177       x1    = t[idx];   x2 = t[1+idx];
3178       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3179       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3180       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3181       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3182       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3183       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3184       v += 25;
3185     }
3186     idc = 5*(*c--);
3187     v   = aa + 25*diag[i];
3188     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3189                                  v[15]*s4+v[20]*s5;
3190     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3191                                  v[16]*s4+v[21]*s5;
3192     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3193                                  v[17]*s4+v[22]*s5;
3194     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3195                                  v[18]*s4+v[23]*s5;
3196     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3197                                  v[19]*s4+v[24]*s5;
3198   }
3199 
3200   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3201   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3202   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3203   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3204   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3205   PetscFunctionReturn(0);
3206 }
3207 
3208 #undef __FUNCT__
3209 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
3210 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
3211 {
3212   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3213   IS                iscol=a->col,isrow=a->row;
3214   PetscErrorCode    ierr;
3215   const PetscInt    *r,*c,*rout,*cout;
3216   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3217   const MatScalar   *aa=a->a,*v;
3218   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3219   const PetscScalar *b;
3220 
3221   PetscFunctionBegin;
3222   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3223   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3224   t  = a->solve_work;
3225 
3226   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3227   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3228 
3229   /* forward solve the lower triangular */
3230   idx    = 5*r[0];
3231   t[0] = b[idx];   t[1] = b[1+idx];
3232   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
3233   for (i=1; i<n; i++) {
3234     v     = aa + 25*ai[i];
3235     vi    = aj + ai[i];
3236     nz    = ai[i+1] - ai[i];
3237     idx   = 5*r[i];
3238     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3239     s5  = b[4+idx];
3240     for(m=0;m<nz;m++){
3241       idx   = 5*vi[m];
3242       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3243       x4    = t[3+idx];x5 = t[4+idx];
3244       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3245       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3246       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3247       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3248       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3249       v += 25;
3250     }
3251     idx = 5*i;
3252     t[idx]   = s1;t[1+idx] = s2;
3253     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
3254   }
3255   /* backward solve the upper triangular */
3256   for (i=n-1; i>=0; i--){
3257     v    = aa + 25*(adiag[i+1]+1);
3258     vi   = aj + adiag[i+1]+1;
3259     nz   = adiag[i] - adiag[i+1] - 1;
3260     idt  = 5*i;
3261     s1 = t[idt];  s2 = t[1+idt];
3262     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
3263     for(m=0;m<nz;m++){
3264       idx   = 5*vi[m];
3265       x1    = t[idx];   x2 = t[1+idx];
3266       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3267       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3268       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3269       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3270       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3271       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
3272       v += 25;
3273     }
3274     idc = 5*c[i];
3275     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3276                                  v[15]*s4+v[20]*s5;
3277     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3278                                  v[16]*s4+v[21]*s5;
3279     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3280                                  v[17]*s4+v[22]*s5;
3281     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3282                                  v[18]*s4+v[23]*s5;
3283     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3284                                  v[19]*s4+v[24]*s5;
3285   }
3286 
3287   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3288   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3289   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3290   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3291   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3292   PetscFunctionReturn(0);
3293 }
3294 
3295 #undef __FUNCT__
3296 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3297 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3298 {
3299   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3300   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3301   PetscErrorCode    ierr;
3302   PetscInt          *diag = a->diag,jdx;
3303   const MatScalar   *aa=a->a,*v;
3304   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3305   const PetscScalar *b;
3306 
3307   PetscFunctionBegin;
3308   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3309   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3310   /* forward solve the lower triangular */
3311   idx    = 0;
3312   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3313   for (i=1; i<n; i++) {
3314     v     =  aa + 25*ai[i];
3315     vi    =  aj + ai[i];
3316     nz    =  diag[i] - ai[i];
3317     idx   =  5*i;
3318     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3319     while (nz--) {
3320       jdx   = 5*(*vi++);
3321       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3322       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3323       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3324       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3325       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3326       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3327       v    += 25;
3328     }
3329     x[idx]   = s1;
3330     x[1+idx] = s2;
3331     x[2+idx] = s3;
3332     x[3+idx] = s4;
3333     x[4+idx] = s5;
3334   }
3335   /* backward solve the upper triangular */
3336   for (i=n-1; i>=0; i--){
3337     v    = aa + 25*diag[i] + 25;
3338     vi   = aj + diag[i] + 1;
3339     nz   = ai[i+1] - diag[i] - 1;
3340     idt  = 5*i;
3341     s1 = x[idt];  s2 = x[1+idt];
3342     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3343     while (nz--) {
3344       idx   = 5*(*vi++);
3345       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3346       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3347       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3348       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3349       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3350       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3351       v    += 25;
3352     }
3353     v        = aa + 25*diag[i];
3354     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3355     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3356     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3357     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3358     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3359   }
3360 
3361   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3362   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3363   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3364   PetscFunctionReturn(0);
3365 }
3366 
3367 #undef __FUNCT__
3368 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
3369 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
3370 {
3371   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3372   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3373   PetscErrorCode    ierr;
3374   PetscInt          jdx;
3375   const MatScalar   *aa=a->a,*v;
3376   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3377   const PetscScalar *b;
3378 
3379   PetscFunctionBegin;
3380   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3381   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3382   /* forward solve the lower triangular */
3383   idx    = 0;
3384   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
3385   for (i=1; i<n; i++) {
3386     v   = aa + 25*ai[i];
3387     vi  = aj + ai[i];
3388     nz  = ai[i+1] - ai[i];
3389     idx = 5*i;
3390     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
3391     for(k=0;k<nz;k++) {
3392       jdx   = 5*vi[k];
3393       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3394       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3395       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3396       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3397       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3398       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3399       v    += 25;
3400     }
3401     x[idx]   = s1;
3402     x[1+idx] = s2;
3403     x[2+idx] = s3;
3404     x[3+idx] = s4;
3405     x[4+idx] = s5;
3406   }
3407 
3408   /* backward solve the upper triangular */
3409   for (i=n-1; i>=0; i--){
3410     v   = aa + 25*(adiag[i+1]+1);
3411     vi  = aj + adiag[i+1]+1;
3412     nz  = adiag[i] - adiag[i+1]-1;
3413     idt = 5*i;
3414     s1 = x[idt];  s2 = x[1+idt];
3415     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
3416     for(k=0;k<nz;k++){
3417       idx   = 5*vi[k];
3418       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3419       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3420       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3421       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3422       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3423       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
3424       v    += 25;
3425     }
3426     /* x = inv_diagonal*x */
3427     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3428     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3429     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3430     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3431     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
3432   }
3433 
3434   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3435   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3436   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
3437   PetscFunctionReturn(0);
3438 }
3439 
3440 #undef __FUNCT__
3441 #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3442 PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
3443 {
3444   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3445   IS                iscol=a->col,isrow=a->row;
3446   PetscErrorCode    ierr;
3447   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3448   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3449   const MatScalar   *aa=a->a,*v;
3450   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3451   const PetscScalar *b;
3452 
3453   PetscFunctionBegin;
3454   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3455   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3456   t  = a->solve_work;
3457 
3458   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3459   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3460 
3461   /* forward solve the lower triangular */
3462   idx    = 4*(*r++);
3463   t[0] = b[idx];   t[1] = b[1+idx];
3464   t[2] = b[2+idx]; t[3] = b[3+idx];
3465   for (i=1; i<n; i++) {
3466     v     = aa + 16*ai[i];
3467     vi    = aj + ai[i];
3468     nz    = diag[i] - ai[i];
3469     idx   = 4*(*r++);
3470     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3471     while (nz--) {
3472       idx   = 4*(*vi++);
3473       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3474       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3475       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3476       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3477       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3478       v    += 16;
3479     }
3480     idx        = 4*i;
3481     t[idx]   = s1;t[1+idx] = s2;
3482     t[2+idx] = s3;t[3+idx] = s4;
3483   }
3484   /* backward solve the upper triangular */
3485   for (i=n-1; i>=0; i--){
3486     v    = aa + 16*diag[i] + 16;
3487     vi   = aj + diag[i] + 1;
3488     nz   = ai[i+1] - diag[i] - 1;
3489     idt  = 4*i;
3490     s1 = t[idt];  s2 = t[1+idt];
3491     s3 = t[2+idt];s4 = t[3+idt];
3492     while (nz--) {
3493       idx   = 4*(*vi++);
3494       x1    = t[idx];   x2 = t[1+idx];
3495       x3    = t[2+idx]; x4 = t[3+idx];
3496       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3497       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3498       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3499       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3500       v += 16;
3501     }
3502     idc      = 4*(*c--);
3503     v        = aa + 16*diag[i];
3504     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3505     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3506     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3507     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3508   }
3509 
3510   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3511   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3512   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3513   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3514   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3515   PetscFunctionReturn(0);
3516 }
3517 
3518 #undef __FUNCT__
3519 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
3520 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
3521 {
3522   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3523   IS                iscol=a->col,isrow=a->row;
3524   PetscErrorCode    ierr;
3525   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3526   const PetscInt    *r,*c,*rout,*cout;
3527   const MatScalar   *aa=a->a,*v;
3528   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3529   const PetscScalar *b;
3530 
3531   PetscFunctionBegin;
3532   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3533   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3534   t  = a->solve_work;
3535 
3536   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3537   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3538 
3539   /* forward solve the lower triangular */
3540   idx    = 4*r[0];
3541   t[0] = b[idx];   t[1] = b[1+idx];
3542   t[2] = b[2+idx]; t[3] = b[3+idx];
3543   for (i=1; i<n; i++) {
3544     v     = aa + 16*ai[i];
3545     vi    = aj + ai[i];
3546     nz    = ai[i+1] - ai[i];
3547     idx   = 4*r[i];
3548     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3549     for(m=0;m<nz;m++){
3550       idx   = 4*vi[m];
3551       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3552       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3553       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3554       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3555       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3556       v    += 16;
3557     }
3558     idx        = 4*i;
3559     t[idx]   = s1;t[1+idx] = s2;
3560     t[2+idx] = s3;t[3+idx] = s4;
3561   }
3562   /* backward solve the upper triangular */
3563   for (i=n-1; i>=0; i--){
3564     v    = aa + 16*(adiag[i+1]+1);
3565     vi   = aj + adiag[i+1]+1;
3566     nz   = adiag[i] - adiag[i+1] - 1;
3567     idt  = 4*i;
3568     s1 = t[idt];  s2 = t[1+idt];
3569     s3 = t[2+idt];s4 = t[3+idt];
3570     for(m=0;m<nz;m++){
3571       idx   = 4*vi[m];
3572       x1    = t[idx];   x2 = t[1+idx];
3573       x3    = t[2+idx]; x4 = t[3+idx];
3574       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3575       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3576       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3577       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3578       v += 16;
3579     }
3580     idc      = 4*c[i];
3581     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3582     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3583     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3584     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3585   }
3586 
3587   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3588   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3589   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3590   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3591   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3592   PetscFunctionReturn(0);
3593 }
3594 
3595 #undef __FUNCT__
3596 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3597 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3598 {
3599   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3600   IS                iscol=a->col,isrow=a->row;
3601   PetscErrorCode    ierr;
3602   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3603   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3604   const MatScalar   *aa=a->a,*v;
3605   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3606   PetscScalar       *x;
3607   const PetscScalar *b;
3608 
3609   PetscFunctionBegin;
3610   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3611   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3612   t  = (MatScalar *)a->solve_work;
3613 
3614   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3615   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3616 
3617   /* forward solve the lower triangular */
3618   idx    = 4*(*r++);
3619   t[0] = (MatScalar)b[idx];
3620   t[1] = (MatScalar)b[1+idx];
3621   t[2] = (MatScalar)b[2+idx];
3622   t[3] = (MatScalar)b[3+idx];
3623   for (i=1; i<n; i++) {
3624     v     = aa + 16*ai[i];
3625     vi    = aj + ai[i];
3626     nz    = diag[i] - ai[i];
3627     idx   = 4*(*r++);
3628     s1 = (MatScalar)b[idx];
3629     s2 = (MatScalar)b[1+idx];
3630     s3 = (MatScalar)b[2+idx];
3631     s4 = (MatScalar)b[3+idx];
3632     while (nz--) {
3633       idx   = 4*(*vi++);
3634       x1  = t[idx];
3635       x2  = t[1+idx];
3636       x3  = t[2+idx];
3637       x4  = t[3+idx];
3638       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3639       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3640       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3641       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3642       v    += 16;
3643     }
3644     idx        = 4*i;
3645     t[idx]   = s1;
3646     t[1+idx] = s2;
3647     t[2+idx] = s3;
3648     t[3+idx] = s4;
3649   }
3650   /* backward solve the upper triangular */
3651   for (i=n-1; i>=0; i--){
3652     v    = aa + 16*diag[i] + 16;
3653     vi   = aj + diag[i] + 1;
3654     nz   = ai[i+1] - diag[i] - 1;
3655     idt  = 4*i;
3656     s1 = t[idt];
3657     s2 = t[1+idt];
3658     s3 = t[2+idt];
3659     s4 = t[3+idt];
3660     while (nz--) {
3661       idx   = 4*(*vi++);
3662       x1  = t[idx];
3663       x2  = t[1+idx];
3664       x3  = t[2+idx];
3665       x4  = t[3+idx];
3666       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3667       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3668       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3669       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3670       v += 16;
3671     }
3672     idc      = 4*(*c--);
3673     v        = aa + 16*diag[i];
3674     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3675     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3676     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3677     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3678     x[idc]   = (PetscScalar)t[idt];
3679     x[1+idc] = (PetscScalar)t[1+idt];
3680     x[2+idc] = (PetscScalar)t[2+idt];
3681     x[3+idc] = (PetscScalar)t[3+idt];
3682  }
3683 
3684   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3685   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3686   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3687   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3688   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3689   PetscFunctionReturn(0);
3690 }
3691 
3692 #if defined (PETSC_HAVE_SSE)
3693 
3694 #include PETSC_HAVE_SSE
3695 
3696 #undef __FUNCT__
3697 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3698 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
3699 {
3700   /*
3701      Note: This code uses demotion of double
3702      to float when performing the mixed-mode computation.
3703      This may not be numerically reasonable for all applications.
3704   */
3705   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3706   IS             iscol=a->col,isrow=a->row;
3707   PetscErrorCode ierr;
3708   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
3709   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3710   MatScalar      *aa=a->a,*v;
3711   PetscScalar    *x,*b,*t;
3712 
3713   /* Make space in temp stack for 16 Byte Aligned arrays */
3714   float           ssealignedspace[11],*tmps,*tmpx;
3715   unsigned long   offset;
3716 
3717   PetscFunctionBegin;
3718   SSE_SCOPE_BEGIN;
3719 
3720     offset = (unsigned long)ssealignedspace % 16;
3721     if (offset) offset = (16 - offset)/4;
3722     tmps = &ssealignedspace[offset];
3723     tmpx = &ssealignedspace[offset+4];
3724     PREFETCH_NTA(aa+16*ai[1]);
3725 
3726     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3727     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3728     t  = a->solve_work;
3729 
3730     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3731     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3732 
3733     /* forward solve the lower triangular */
3734     idx  = 4*(*r++);
3735     t[0] = b[idx];   t[1] = b[1+idx];
3736     t[2] = b[2+idx]; t[3] = b[3+idx];
3737     v    =  aa + 16*ai[1];
3738 
3739     for (i=1; i<n;) {
3740       PREFETCH_NTA(&v[8]);
3741       vi   =  aj      + ai[i];
3742       nz   =  diag[i] - ai[i];
3743       idx  =  4*(*r++);
3744 
3745       /* Demote sum from double to float */
3746       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
3747       LOAD_PS(tmps,XMM7);
3748 
3749       while (nz--) {
3750         PREFETCH_NTA(&v[16]);
3751         idx = 4*(*vi++);
3752 
3753         /* Demote solution (so far) from double to float */
3754         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
3755 
3756         /* 4x4 Matrix-Vector product with negative accumulation: */
3757         SSE_INLINE_BEGIN_2(tmpx,v)
3758           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3759 
3760           /* First Column */
3761           SSE_COPY_PS(XMM0,XMM6)
3762           SSE_SHUFFLE(XMM0,XMM0,0x00)
3763           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3764           SSE_SUB_PS(XMM7,XMM0)
3765 
3766           /* Second Column */
3767           SSE_COPY_PS(XMM1,XMM6)
3768           SSE_SHUFFLE(XMM1,XMM1,0x55)
3769           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3770           SSE_SUB_PS(XMM7,XMM1)
3771 
3772           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3773 
3774           /* Third Column */
3775           SSE_COPY_PS(XMM2,XMM6)
3776           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3777           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3778           SSE_SUB_PS(XMM7,XMM2)
3779 
3780           /* Fourth Column */
3781           SSE_COPY_PS(XMM3,XMM6)
3782           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3783           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3784           SSE_SUB_PS(XMM7,XMM3)
3785         SSE_INLINE_END_2
3786 
3787         v  += 16;
3788       }
3789       idx = 4*i;
3790       v   = aa + 16*ai[++i];
3791       PREFETCH_NTA(v);
3792       STORE_PS(tmps,XMM7);
3793 
3794       /* Promote result from float to double */
3795       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
3796     }
3797     /* backward solve the upper triangular */
3798     idt  = 4*(n-1);
3799     ai16 = 16*diag[n-1];
3800     v    = aa + ai16 + 16;
3801     for (i=n-1; i>=0;){
3802       PREFETCH_NTA(&v[8]);
3803       vi = aj + diag[i] + 1;
3804       nz = ai[i+1] - diag[i] - 1;
3805 
3806       /* Demote accumulator from double to float */
3807       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
3808       LOAD_PS(tmps,XMM7);
3809 
3810       while (nz--) {
3811         PREFETCH_NTA(&v[16]);
3812         idx = 4*(*vi++);
3813 
3814         /* Demote solution (so far) from double to float */
3815         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
3816 
3817         /* 4x4 Matrix-Vector Product with negative accumulation: */
3818         SSE_INLINE_BEGIN_2(tmpx,v)
3819           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
3820 
3821           /* First Column */
3822           SSE_COPY_PS(XMM0,XMM6)
3823           SSE_SHUFFLE(XMM0,XMM0,0x00)
3824           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
3825           SSE_SUB_PS(XMM7,XMM0)
3826 
3827           /* Second Column */
3828           SSE_COPY_PS(XMM1,XMM6)
3829           SSE_SHUFFLE(XMM1,XMM1,0x55)
3830           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
3831           SSE_SUB_PS(XMM7,XMM1)
3832 
3833           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
3834 
3835           /* Third Column */
3836           SSE_COPY_PS(XMM2,XMM6)
3837           SSE_SHUFFLE(XMM2,XMM2,0xAA)
3838           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
3839           SSE_SUB_PS(XMM7,XMM2)
3840 
3841           /* Fourth Column */
3842           SSE_COPY_PS(XMM3,XMM6)
3843           SSE_SHUFFLE(XMM3,XMM3,0xFF)
3844           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
3845           SSE_SUB_PS(XMM7,XMM3)
3846         SSE_INLINE_END_2
3847         v  += 16;
3848       }
3849       v    = aa + ai16;
3850       ai16 = 16*diag[--i];
3851       PREFETCH_NTA(aa+ai16+16);
3852       /*
3853          Scale the result by the diagonal 4x4 block,
3854          which was inverted as part of the factorization
3855       */
3856       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
3857         /* First Column */
3858         SSE_COPY_PS(XMM0,XMM7)
3859         SSE_SHUFFLE(XMM0,XMM0,0x00)
3860         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
3861 
3862         /* Second Column */
3863         SSE_COPY_PS(XMM1,XMM7)
3864         SSE_SHUFFLE(XMM1,XMM1,0x55)
3865         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
3866         SSE_ADD_PS(XMM0,XMM1)
3867 
3868         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
3869 
3870         /* Third Column */
3871         SSE_COPY_PS(XMM2,XMM7)
3872         SSE_SHUFFLE(XMM2,XMM2,0xAA)
3873         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
3874         SSE_ADD_PS(XMM0,XMM2)
3875 
3876         /* Fourth Column */
3877         SSE_COPY_PS(XMM3,XMM7)
3878         SSE_SHUFFLE(XMM3,XMM3,0xFF)
3879         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
3880         SSE_ADD_PS(XMM0,XMM3)
3881 
3882         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
3883       SSE_INLINE_END_3
3884 
3885       /* Promote solution from float to double */
3886       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
3887 
3888       /* Apply reordering to t and stream into x.    */
3889       /* This way, x doesn't pollute the cache.      */
3890       /* Be careful with size: 2 doubles = 4 floats! */
3891       idc  = 4*(*c--);
3892       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
3893         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
3894         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
3895         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
3896         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
3897         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
3898         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
3899       SSE_INLINE_END_2
3900       v    = aa + ai16 + 16;
3901       idt -= 4;
3902     }
3903 
3904     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3905     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3906     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3907     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3908     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3909   SSE_SCOPE_END;
3910   PetscFunctionReturn(0);
3911 }
3912 
3913 #endif
3914 
3915 
3916 /*
3917       Special case where the matrix was ILU(0) factored in the natural
3918    ordering. This eliminates the need for the column and row permutation.
3919 */
3920 #undef __FUNCT__
3921 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
3922 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
3923 {
3924   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3925   PetscInt          n=a->mbs;
3926   const PetscInt    *ai=a->i,*aj=a->j;
3927   PetscErrorCode    ierr;
3928   const PetscInt    *diag = a->diag;
3929   const MatScalar   *aa=a->a;
3930   PetscScalar       *x;
3931   const PetscScalar *b;
3932 
3933   PetscFunctionBegin;
3934   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3935   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3936 
3937 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
3938   {
3939     static PetscScalar w[2000]; /* very BAD need to fix */
3940     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
3941   }
3942 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
3943   {
3944     static PetscScalar w[2000]; /* very BAD need to fix */
3945     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
3946   }
3947 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
3948   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3949 #else
3950   {
3951     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3952     const MatScalar *v;
3953     PetscInt        jdx,idt,idx,nz,i,ai16;
3954     const PetscInt  *vi;
3955 
3956   /* forward solve the lower triangular */
3957   idx    = 0;
3958   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
3959   for (i=1; i<n; i++) {
3960     v     =  aa      + 16*ai[i];
3961     vi    =  aj      + ai[i];
3962     nz    =  diag[i] - ai[i];
3963     idx   +=  4;
3964     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3965     while (nz--) {
3966       jdx   = 4*(*vi++);
3967       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3968       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3969       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3970       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3971       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3972       v    += 16;
3973     }
3974     x[idx]   = s1;
3975     x[1+idx] = s2;
3976     x[2+idx] = s3;
3977     x[3+idx] = s4;
3978   }
3979   /* backward solve the upper triangular */
3980   idt = 4*(n-1);
3981   for (i=n-1; i>=0; i--){
3982     ai16 = 16*diag[i];
3983     v    = aa + ai16 + 16;
3984     vi   = aj + diag[i] + 1;
3985     nz   = ai[i+1] - diag[i] - 1;
3986     s1 = x[idt];  s2 = x[1+idt];
3987     s3 = x[2+idt];s4 = x[3+idt];
3988     while (nz--) {
3989       idx   = 4*(*vi++);
3990       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3991       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3992       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3993       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3994       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3995       v    += 16;
3996     }
3997     v        = aa + ai16;
3998     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3999     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4000     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4001     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4002     idt -= 4;
4003   }
4004   }
4005 #endif
4006 
4007   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4008   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4009   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4010   PetscFunctionReturn(0);
4011 }
4012 
4013 #undef __FUNCT__
4014 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
4015 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4016 {
4017     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4018     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4019     PetscErrorCode    ierr;
4020     PetscInt          idx,jdx,idt;
4021     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4022     const MatScalar   *aa=a->a,*v;
4023     PetscScalar       *x;
4024     const PetscScalar *b;
4025     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4026 
4027     PetscFunctionBegin;
4028     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4029     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4030     /* forward solve the lower triangular */
4031     idx    = 0;
4032     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4033     for (i=1; i<n; i++) {
4034        v    = aa + bs2*ai[i];
4035        vi   = aj + ai[i];
4036        nz   = ai[i+1] - ai[i];
4037       idx   = bs*i;
4038        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4039       for(k=0;k<nz;k++) {
4040           jdx   = bs*vi[k];
4041           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4042           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4043           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4044           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4045 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4046 
4047           v   +=  bs2;
4048         }
4049 
4050        x[idx]   = s1;
4051        x[1+idx] = s2;
4052        x[2+idx] = s3;
4053        x[3+idx] = s4;
4054     }
4055 
4056    /* backward solve the upper triangular */
4057   for (i=n-1; i>=0; i--){
4058     v   = aa + bs2*(adiag[i+1]+1);
4059      vi  = aj + adiag[i+1]+1;
4060      nz  = adiag[i] - adiag[i+1]-1;
4061      idt = bs*i;
4062      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4063 
4064     for(k=0;k<nz;k++){
4065       idx   = bs*vi[k];
4066        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4067        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4068        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4069        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4070        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4071 
4072         v   +=  bs2;
4073     }
4074     /* x = inv_diagonal*x */
4075    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4076    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4077    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4078    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4079 
4080   }
4081 
4082   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4083   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4084   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4085   PetscFunctionReturn(0);
4086 }
4087 
4088 #undef __FUNCT__
4089 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4090 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4091 {
4092   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4093   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4094   PetscErrorCode ierr;
4095   PetscInt       *diag = a->diag;
4096   MatScalar      *aa=a->a;
4097   PetscScalar    *x,*b;
4098 
4099   PetscFunctionBegin;
4100   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4101   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4102 
4103   {
4104     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
4105     MatScalar  *v,*t=(MatScalar *)x;
4106     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
4107 
4108     /* forward solve the lower triangular */
4109     idx  = 0;
4110     t[0] = (MatScalar)b[0];
4111     t[1] = (MatScalar)b[1];
4112     t[2] = (MatScalar)b[2];
4113     t[3] = (MatScalar)b[3];
4114     for (i=1; i<n; i++) {
4115       v     =  aa      + 16*ai[i];
4116       vi    =  aj      + ai[i];
4117       nz    =  diag[i] - ai[i];
4118       idx   +=  4;
4119       s1 = (MatScalar)b[idx];
4120       s2 = (MatScalar)b[1+idx];
4121       s3 = (MatScalar)b[2+idx];
4122       s4 = (MatScalar)b[3+idx];
4123       while (nz--) {
4124         jdx = 4*(*vi++);
4125         x1  = t[jdx];
4126         x2  = t[1+jdx];
4127         x3  = t[2+jdx];
4128         x4  = t[3+jdx];
4129         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4130         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4131         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4132         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4133         v    += 16;
4134       }
4135       t[idx]   = s1;
4136       t[1+idx] = s2;
4137       t[2+idx] = s3;
4138       t[3+idx] = s4;
4139     }
4140     /* backward solve the upper triangular */
4141     idt = 4*(n-1);
4142     for (i=n-1; i>=0; i--){
4143       ai16 = 16*diag[i];
4144       v    = aa + ai16 + 16;
4145       vi   = aj + diag[i] + 1;
4146       nz   = ai[i+1] - diag[i] - 1;
4147       s1   = t[idt];
4148       s2   = t[1+idt];
4149       s3   = t[2+idt];
4150       s4   = t[3+idt];
4151       while (nz--) {
4152         idx = 4*(*vi++);
4153         x1  = (MatScalar)x[idx];
4154         x2  = (MatScalar)x[1+idx];
4155         x3  = (MatScalar)x[2+idx];
4156         x4  = (MatScalar)x[3+idx];
4157         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4158         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4159         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4160         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4161         v    += 16;
4162       }
4163       v        = aa + ai16;
4164       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4165       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4166       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4167       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4168       idt -= 4;
4169     }
4170   }
4171 
4172   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4173   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4174   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4175   PetscFunctionReturn(0);
4176 }
4177 
4178 #if defined (PETSC_HAVE_SSE)
4179 
4180 #include PETSC_HAVE_SSE
4181 #undef __FUNCT__
4182 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4183 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
4184 {
4185   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4186   unsigned short *aj=(unsigned short *)a->j;
4187   PetscErrorCode ierr;
4188   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4189   MatScalar      *aa=a->a;
4190   PetscScalar    *x,*b;
4191 
4192   PetscFunctionBegin;
4193   SSE_SCOPE_BEGIN;
4194   /*
4195      Note: This code currently uses demotion of double
4196      to float when performing the mixed-mode computation.
4197      This may not be numerically reasonable for all applications.
4198   */
4199   PREFETCH_NTA(aa+16*ai[1]);
4200 
4201   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4202   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4203   {
4204     /* x will first be computed in single precision then promoted inplace to double */
4205     MatScalar      *v,*t=(MatScalar *)x;
4206     int            nz,i,idt,ai16;
4207     unsigned int   jdx,idx;
4208     unsigned short *vi;
4209     /* Forward solve the lower triangular factor. */
4210 
4211     /* First block is the identity. */
4212     idx  = 0;
4213     CONVERT_DOUBLE4_FLOAT4(t,b);
4214     v    =  aa + 16*((unsigned int)ai[1]);
4215 
4216     for (i=1; i<n;) {
4217       PREFETCH_NTA(&v[8]);
4218       vi   =  aj      + ai[i];
4219       nz   =  diag[i] - ai[i];
4220       idx +=  4;
4221 
4222       /* Demote RHS from double to float. */
4223       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4224       LOAD_PS(&t[idx],XMM7);
4225 
4226       while (nz--) {
4227         PREFETCH_NTA(&v[16]);
4228         jdx = 4*((unsigned int)(*vi++));
4229 
4230         /* 4x4 Matrix-Vector product with negative accumulation: */
4231         SSE_INLINE_BEGIN_2(&t[jdx],v)
4232           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4233 
4234           /* First Column */
4235           SSE_COPY_PS(XMM0,XMM6)
4236           SSE_SHUFFLE(XMM0,XMM0,0x00)
4237           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4238           SSE_SUB_PS(XMM7,XMM0)
4239 
4240           /* Second Column */
4241           SSE_COPY_PS(XMM1,XMM6)
4242           SSE_SHUFFLE(XMM1,XMM1,0x55)
4243           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4244           SSE_SUB_PS(XMM7,XMM1)
4245 
4246           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4247 
4248           /* Third Column */
4249           SSE_COPY_PS(XMM2,XMM6)
4250           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4251           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4252           SSE_SUB_PS(XMM7,XMM2)
4253 
4254           /* Fourth Column */
4255           SSE_COPY_PS(XMM3,XMM6)
4256           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4257           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4258           SSE_SUB_PS(XMM7,XMM3)
4259         SSE_INLINE_END_2
4260 
4261         v  += 16;
4262       }
4263       v    =  aa + 16*ai[++i];
4264       PREFETCH_NTA(v);
4265       STORE_PS(&t[idx],XMM7);
4266     }
4267 
4268     /* Backward solve the upper triangular factor.*/
4269 
4270     idt  = 4*(n-1);
4271     ai16 = 16*diag[n-1];
4272     v    = aa + ai16 + 16;
4273     for (i=n-1; i>=0;){
4274       PREFETCH_NTA(&v[8]);
4275       vi = aj + diag[i] + 1;
4276       nz = ai[i+1] - diag[i] - 1;
4277 
4278       LOAD_PS(&t[idt],XMM7);
4279 
4280       while (nz--) {
4281         PREFETCH_NTA(&v[16]);
4282         idx = 4*((unsigned int)(*vi++));
4283 
4284         /* 4x4 Matrix-Vector Product with negative accumulation: */
4285         SSE_INLINE_BEGIN_2(&t[idx],v)
4286           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4287 
4288           /* First Column */
4289           SSE_COPY_PS(XMM0,XMM6)
4290           SSE_SHUFFLE(XMM0,XMM0,0x00)
4291           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4292           SSE_SUB_PS(XMM7,XMM0)
4293 
4294           /* Second Column */
4295           SSE_COPY_PS(XMM1,XMM6)
4296           SSE_SHUFFLE(XMM1,XMM1,0x55)
4297           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4298           SSE_SUB_PS(XMM7,XMM1)
4299 
4300           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4301 
4302           /* Third Column */
4303           SSE_COPY_PS(XMM2,XMM6)
4304           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4305           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4306           SSE_SUB_PS(XMM7,XMM2)
4307 
4308           /* Fourth Column */
4309           SSE_COPY_PS(XMM3,XMM6)
4310           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4311           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4312           SSE_SUB_PS(XMM7,XMM3)
4313         SSE_INLINE_END_2
4314         v  += 16;
4315       }
4316       v    = aa + ai16;
4317       ai16 = 16*diag[--i];
4318       PREFETCH_NTA(aa+ai16+16);
4319       /*
4320          Scale the result by the diagonal 4x4 block,
4321          which was inverted as part of the factorization
4322       */
4323       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4324         /* First Column */
4325         SSE_COPY_PS(XMM0,XMM7)
4326         SSE_SHUFFLE(XMM0,XMM0,0x00)
4327         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4328 
4329         /* Second Column */
4330         SSE_COPY_PS(XMM1,XMM7)
4331         SSE_SHUFFLE(XMM1,XMM1,0x55)
4332         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4333         SSE_ADD_PS(XMM0,XMM1)
4334 
4335         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4336 
4337         /* Third Column */
4338         SSE_COPY_PS(XMM2,XMM7)
4339         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4340         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4341         SSE_ADD_PS(XMM0,XMM2)
4342 
4343         /* Fourth Column */
4344         SSE_COPY_PS(XMM3,XMM7)
4345         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4346         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4347         SSE_ADD_PS(XMM0,XMM3)
4348 
4349         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4350       SSE_INLINE_END_3
4351 
4352       v    = aa + ai16 + 16;
4353       idt -= 4;
4354     }
4355 
4356     /* Convert t from single precision back to double precision (inplace)*/
4357     idt = 4*(n-1);
4358     for (i=n-1;i>=0;i--) {
4359       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4360       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4361       PetscScalar *xtemp=&x[idt];
4362       MatScalar   *ttemp=&t[idt];
4363       xtemp[3] = (PetscScalar)ttemp[3];
4364       xtemp[2] = (PetscScalar)ttemp[2];
4365       xtemp[1] = (PetscScalar)ttemp[1];
4366       xtemp[0] = (PetscScalar)ttemp[0];
4367       idt -= 4;
4368     }
4369 
4370   } /* End of artificial scope. */
4371   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4372   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4373   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4374   SSE_SCOPE_END;
4375   PetscFunctionReturn(0);
4376 }
4377 
4378 #undef __FUNCT__
4379 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
4380 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
4381 {
4382   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4383   int            *aj=a->j;
4384   PetscErrorCode ierr;
4385   int            *ai=a->i,n=a->mbs,*diag = a->diag;
4386   MatScalar      *aa=a->a;
4387   PetscScalar    *x,*b;
4388 
4389   PetscFunctionBegin;
4390   SSE_SCOPE_BEGIN;
4391   /*
4392      Note: This code currently uses demotion of double
4393      to float when performing the mixed-mode computation.
4394      This may not be numerically reasonable for all applications.
4395   */
4396   PREFETCH_NTA(aa+16*ai[1]);
4397 
4398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4400   {
4401     /* x will first be computed in single precision then promoted inplace to double */
4402     MatScalar *v,*t=(MatScalar *)x;
4403     int       nz,i,idt,ai16;
4404     int       jdx,idx;
4405     int       *vi;
4406     /* Forward solve the lower triangular factor. */
4407 
4408     /* First block is the identity. */
4409     idx  = 0;
4410     CONVERT_DOUBLE4_FLOAT4(t,b);
4411     v    =  aa + 16*ai[1];
4412 
4413     for (i=1; i<n;) {
4414       PREFETCH_NTA(&v[8]);
4415       vi   =  aj      + ai[i];
4416       nz   =  diag[i] - ai[i];
4417       idx +=  4;
4418 
4419       /* Demote RHS from double to float. */
4420       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4421       LOAD_PS(&t[idx],XMM7);
4422 
4423       while (nz--) {
4424         PREFETCH_NTA(&v[16]);
4425         jdx = 4*(*vi++);
4426 /*          jdx = *vi++; */
4427 
4428         /* 4x4 Matrix-Vector product with negative accumulation: */
4429         SSE_INLINE_BEGIN_2(&t[jdx],v)
4430           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4431 
4432           /* First Column */
4433           SSE_COPY_PS(XMM0,XMM6)
4434           SSE_SHUFFLE(XMM0,XMM0,0x00)
4435           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4436           SSE_SUB_PS(XMM7,XMM0)
4437 
4438           /* Second Column */
4439           SSE_COPY_PS(XMM1,XMM6)
4440           SSE_SHUFFLE(XMM1,XMM1,0x55)
4441           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4442           SSE_SUB_PS(XMM7,XMM1)
4443 
4444           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4445 
4446           /* Third Column */
4447           SSE_COPY_PS(XMM2,XMM6)
4448           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4449           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4450           SSE_SUB_PS(XMM7,XMM2)
4451 
4452           /* Fourth Column */
4453           SSE_COPY_PS(XMM3,XMM6)
4454           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4455           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4456           SSE_SUB_PS(XMM7,XMM3)
4457         SSE_INLINE_END_2
4458 
4459         v  += 16;
4460       }
4461       v    =  aa + 16*ai[++i];
4462       PREFETCH_NTA(v);
4463       STORE_PS(&t[idx],XMM7);
4464     }
4465 
4466     /* Backward solve the upper triangular factor.*/
4467 
4468     idt  = 4*(n-1);
4469     ai16 = 16*diag[n-1];
4470     v    = aa + ai16 + 16;
4471     for (i=n-1; i>=0;){
4472       PREFETCH_NTA(&v[8]);
4473       vi = aj + diag[i] + 1;
4474       nz = ai[i+1] - diag[i] - 1;
4475 
4476       LOAD_PS(&t[idt],XMM7);
4477 
4478       while (nz--) {
4479         PREFETCH_NTA(&v[16]);
4480         idx = 4*(*vi++);
4481 /*          idx = *vi++; */
4482 
4483         /* 4x4 Matrix-Vector Product with negative accumulation: */
4484         SSE_INLINE_BEGIN_2(&t[idx],v)
4485           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
4486 
4487           /* First Column */
4488           SSE_COPY_PS(XMM0,XMM6)
4489           SSE_SHUFFLE(XMM0,XMM0,0x00)
4490           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
4491           SSE_SUB_PS(XMM7,XMM0)
4492 
4493           /* Second Column */
4494           SSE_COPY_PS(XMM1,XMM6)
4495           SSE_SHUFFLE(XMM1,XMM1,0x55)
4496           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
4497           SSE_SUB_PS(XMM7,XMM1)
4498 
4499           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
4500 
4501           /* Third Column */
4502           SSE_COPY_PS(XMM2,XMM6)
4503           SSE_SHUFFLE(XMM2,XMM2,0xAA)
4504           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
4505           SSE_SUB_PS(XMM7,XMM2)
4506 
4507           /* Fourth Column */
4508           SSE_COPY_PS(XMM3,XMM6)
4509           SSE_SHUFFLE(XMM3,XMM3,0xFF)
4510           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
4511           SSE_SUB_PS(XMM7,XMM3)
4512         SSE_INLINE_END_2
4513         v  += 16;
4514       }
4515       v    = aa + ai16;
4516       ai16 = 16*diag[--i];
4517       PREFETCH_NTA(aa+ai16+16);
4518       /*
4519          Scale the result by the diagonal 4x4 block,
4520          which was inverted as part of the factorization
4521       */
4522       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
4523         /* First Column */
4524         SSE_COPY_PS(XMM0,XMM7)
4525         SSE_SHUFFLE(XMM0,XMM0,0x00)
4526         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
4527 
4528         /* Second Column */
4529         SSE_COPY_PS(XMM1,XMM7)
4530         SSE_SHUFFLE(XMM1,XMM1,0x55)
4531         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
4532         SSE_ADD_PS(XMM0,XMM1)
4533 
4534         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
4535 
4536         /* Third Column */
4537         SSE_COPY_PS(XMM2,XMM7)
4538         SSE_SHUFFLE(XMM2,XMM2,0xAA)
4539         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
4540         SSE_ADD_PS(XMM0,XMM2)
4541 
4542         /* Fourth Column */
4543         SSE_COPY_PS(XMM3,XMM7)
4544         SSE_SHUFFLE(XMM3,XMM3,0xFF)
4545         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
4546         SSE_ADD_PS(XMM0,XMM3)
4547 
4548         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
4549       SSE_INLINE_END_3
4550 
4551       v    = aa + ai16 + 16;
4552       idt -= 4;
4553     }
4554 
4555     /* Convert t from single precision back to double precision (inplace)*/
4556     idt = 4*(n-1);
4557     for (i=n-1;i>=0;i--) {
4558       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4559       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4560       PetscScalar *xtemp=&x[idt];
4561       MatScalar   *ttemp=&t[idt];
4562       xtemp[3] = (PetscScalar)ttemp[3];
4563       xtemp[2] = (PetscScalar)ttemp[2];
4564       xtemp[1] = (PetscScalar)ttemp[1];
4565       xtemp[0] = (PetscScalar)ttemp[0];
4566       idt -= 4;
4567     }
4568 
4569   } /* End of artificial scope. */
4570   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4571   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4572   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4573   SSE_SCOPE_END;
4574   PetscFunctionReturn(0);
4575 }
4576 
4577 #endif
4578 
4579 #undef __FUNCT__
4580 #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4581 PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
4582 {
4583   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4584   IS                iscol=a->col,isrow=a->row;
4585   PetscErrorCode    ierr;
4586   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4587   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4588   const MatScalar   *aa=a->a,*v;
4589   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4590   const PetscScalar *b;
4591 
4592   PetscFunctionBegin;
4593   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4594   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4595   t  = a->solve_work;
4596 
4597   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4598   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4599 
4600   /* forward solve the lower triangular */
4601   idx    = 3*(*r++);
4602   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4603   for (i=1; i<n; i++) {
4604     v     = aa + 9*ai[i];
4605     vi    = aj + ai[i];
4606     nz    = diag[i] - ai[i];
4607     idx   = 3*(*r++);
4608     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4609     while (nz--) {
4610       idx   = 3*(*vi++);
4611       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4612       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4613       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4614       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4615       v += 9;
4616     }
4617     idx = 3*i;
4618     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4619   }
4620   /* backward solve the upper triangular */
4621   for (i=n-1; i>=0; i--){
4622     v    = aa + 9*diag[i] + 9;
4623     vi   = aj + diag[i] + 1;
4624     nz   = ai[i+1] - diag[i] - 1;
4625     idt  = 3*i;
4626     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4627     while (nz--) {
4628       idx   = 3*(*vi++);
4629       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4630       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4631       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4632       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4633       v += 9;
4634     }
4635     idc = 3*(*c--);
4636     v   = aa + 9*diag[i];
4637     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4638     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4639     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4640   }
4641   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4642   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4643   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4644   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4645   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4646   PetscFunctionReturn(0);
4647 }
4648 
4649 #undef __FUNCT__
4650 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4651 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
4652 {
4653   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4654   IS                iscol=a->col,isrow=a->row;
4655   PetscErrorCode    ierr;
4656   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
4657   const PetscInt    *r,*c,*rout,*cout;
4658   const MatScalar   *aa=a->a,*v;
4659   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4660   const PetscScalar *b;
4661 
4662   PetscFunctionBegin;
4663   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4664   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4665   t  = a->solve_work;
4666 
4667   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4668   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4669 
4670   /* forward solve the lower triangular */
4671   idx    = 3*r[0];
4672   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
4673   for (i=1; i<n; i++) {
4674     v     = aa + 9*ai[i];
4675     vi    = aj + ai[i];
4676     nz    = ai[i+1] - ai[i];
4677     idx   = 3*r[i];
4678     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
4679     for(m=0;m<nz;m++){
4680       idx   = 3*vi[m];
4681       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4682       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4683       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4684       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4685       v += 9;
4686     }
4687     idx = 3*i;
4688     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
4689   }
4690   /* backward solve the upper triangular */
4691   for (i=n-1; i>=0; i--){
4692     v    = aa + 9*(adiag[i+1]+1);
4693     vi   = aj + adiag[i+1]+1;
4694     nz   = adiag[i] - adiag[i+1] - 1;
4695     idt  = 3*i;
4696     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
4697     for(m=0;m<nz;m++){
4698       idx   = 3*vi[m];
4699       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4700       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4701       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4702       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4703       v += 9;
4704     }
4705     idc = 3*c[i];
4706     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4707     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4708     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4709   }
4710   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4711   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4712   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4713   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4714   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4715   PetscFunctionReturn(0);
4716 }
4717 
4718 /*
4719       Special case where the matrix was ILU(0) factored in the natural
4720    ordering. This eliminates the need for the column and row permutation.
4721 */
4722 #undef __FUNCT__
4723 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4724 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4725 {
4726   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4727   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4728   PetscErrorCode    ierr;
4729   PetscInt          *diag = a->diag;
4730   const MatScalar   *aa=a->a,*v;
4731   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4732   const PetscScalar *b;
4733   PetscInt          jdx,idt,idx,nz,*vi,i;
4734 
4735   PetscFunctionBegin;
4736   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4737   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4738 
4739   /* forward solve the lower triangular */
4740   idx    = 0;
4741   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
4742   for (i=1; i<n; i++) {
4743     v     =  aa      + 9*ai[i];
4744     vi    =  aj      + ai[i];
4745     nz    =  diag[i] - ai[i];
4746     idx   +=  3;
4747     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
4748     while (nz--) {
4749       jdx   = 3*(*vi++);
4750       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4751       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4752       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4753       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4754       v    += 9;
4755     }
4756     x[idx]   = s1;
4757     x[1+idx] = s2;
4758     x[2+idx] = s3;
4759   }
4760   /* backward solve the upper triangular */
4761   for (i=n-1; i>=0; i--){
4762     v    = aa + 9*diag[i] + 9;
4763     vi   = aj + diag[i] + 1;
4764     nz   = ai[i+1] - diag[i] - 1;
4765     idt  = 3*i;
4766     s1 = x[idt];  s2 = x[1+idt];
4767     s3 = x[2+idt];
4768     while (nz--) {
4769       idx   = 3*(*vi++);
4770       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4771       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4772       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4773       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4774       v    += 9;
4775     }
4776     v        = aa +  9*diag[i];
4777     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4778     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4779     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4780   }
4781 
4782   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4783   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4784   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
4785   PetscFunctionReturn(0);
4786 }
4787 
4788 #undef __FUNCT__
4789 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4790 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4791 {
4792     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4793     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4794     PetscErrorCode    ierr;
4795     PetscInt          idx,jdx,idt;
4796     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4797     const MatScalar   *aa=a->a,*v;
4798     PetscScalar       *x;
4799     const PetscScalar *b;
4800     PetscScalar        s1,s2,s3,x1,x2,x3;
4801 
4802     PetscFunctionBegin;
4803     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4804     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4805     /* forward solve the lower triangular */
4806     idx    = 0;
4807     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4808     for (i=1; i<n; i++) {
4809        v    = aa + bs2*ai[i];
4810        vi   = aj + ai[i];
4811        nz   = ai[i+1] - ai[i];
4812       idx   = bs*i;
4813        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4814       for(k=0;k<nz;k++){
4815          jdx   = bs*vi[k];
4816           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4817           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4818           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4819           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4820 
4821           v   +=  bs2;
4822         }
4823 
4824        x[idx]   = s1;
4825        x[1+idx] = s2;
4826        x[2+idx] = s3;
4827     }
4828 
4829    /* backward solve the upper triangular */
4830   for (i=n-1; i>=0; i--){
4831     v   = aa + bs2*(adiag[i+1]+1);
4832      vi  = aj + adiag[i+1]+1;
4833      nz  = adiag[i] - adiag[i+1]-1;
4834      idt = bs*i;
4835      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4836 
4837      for(k=0;k<nz;k++){
4838        idx   = bs*vi[k];
4839        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4840        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4841        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4842        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4843 
4844         v   +=  bs2;
4845     }
4846     /* x = inv_diagonal*x */
4847    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4848    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4849    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4850 
4851   }
4852 
4853   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4854   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4855   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4856   PetscFunctionReturn(0);
4857 }
4858 
4859 #undef __FUNCT__
4860 #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4861 PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
4862 {
4863   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4864   IS                iscol=a->col,isrow=a->row;
4865   PetscErrorCode    ierr;
4866   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
4867   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4868   const MatScalar   *aa=a->a,*v;
4869   PetscScalar       *x,s1,s2,x1,x2,*t;
4870   const PetscScalar *b;
4871 
4872   PetscFunctionBegin;
4873   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4874   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4875   t  = a->solve_work;
4876 
4877   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4878   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
4879 
4880   /* forward solve the lower triangular */
4881   idx    = 2*(*r++);
4882   t[0] = b[idx]; t[1] = b[1+idx];
4883   for (i=1; i<n; i++) {
4884     v     = aa + 4*ai[i];
4885     vi    = aj + ai[i];
4886     nz    = diag[i] - ai[i];
4887     idx   = 2*(*r++);
4888     s1  = b[idx]; s2 = b[1+idx];
4889     while (nz--) {
4890       idx   = 2*(*vi++);
4891       x1    = t[idx]; x2 = t[1+idx];
4892       s1 -= v[0]*x1 + v[2]*x2;
4893       s2 -= v[1]*x1 + v[3]*x2;
4894       v += 4;
4895     }
4896     idx = 2*i;
4897     t[idx] = s1; t[1+idx] = s2;
4898   }
4899   /* backward solve the upper triangular */
4900   for (i=n-1; i>=0; i--){
4901     v    = aa + 4*diag[i] + 4;
4902     vi   = aj + diag[i] + 1;
4903     nz   = ai[i+1] - diag[i] - 1;
4904     idt  = 2*i;
4905     s1 = t[idt]; s2 = t[1+idt];
4906     while (nz--) {
4907       idx   = 2*(*vi++);
4908       x1    = t[idx]; x2 = t[1+idx];
4909       s1 -= v[0]*x1 + v[2]*x2;
4910       s2 -= v[1]*x1 + v[3]*x2;
4911       v += 4;
4912     }
4913     idc = 2*(*c--);
4914     v   = aa + 4*diag[i];
4915     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4916     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4917   }
4918   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4919   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4920   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4921   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4922   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4923   PetscFunctionReturn(0);
4924 }
4925 
4926 #undef __FUNCT__
4927 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4928 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
4929 {
4930   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4931   IS                iscol=a->col,isrow=a->row;
4932   PetscErrorCode    ierr;
4933   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4934   const PetscInt    *r,*c,*rout,*cout;
4935   const MatScalar   *aa=a->a,*v;
4936   PetscScalar       *x,s1,s2,x1,x2,*t;
4937   const PetscScalar *b;
4938 
4939   PetscFunctionBegin;
4940   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4941   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4942   t  = a->solve_work;
4943 
4944   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4945   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4946 
4947   /* forward solve the lower triangular */
4948   idx    = 2*r[0];
4949   t[0] = b[idx]; t[1] = b[1+idx];
4950   for (i=1; i<n; i++) {
4951     v     = aa + 4*ai[i];
4952     vi    = aj + ai[i];
4953     nz    = ai[i+1] - ai[i];
4954     idx   = 2*r[i];
4955     s1  = b[idx]; s2 = b[1+idx];
4956     for(m=0;m<nz;m++){
4957       jdx   = 2*vi[m];
4958       x1    = t[jdx]; x2 = t[1+jdx];
4959       s1 -= v[0]*x1 + v[2]*x2;
4960       s2 -= v[1]*x1 + v[3]*x2;
4961       v += 4;
4962     }
4963     idx = 2*i;
4964     t[idx] = s1; t[1+idx] = s2;
4965   }
4966   /* backward solve the upper triangular */
4967   for (i=n-1; i>=0; i--){
4968     v    = aa + 4*(adiag[i+1]+1);
4969     vi   = aj + adiag[i+1]+1;
4970     nz   = adiag[i] - adiag[i+1] - 1;
4971     idt  = 2*i;
4972     s1 = t[idt]; s2 = t[1+idt];
4973     for(m=0;m<nz;m++){
4974       idx   = 2*vi[m];
4975       x1    = t[idx]; x2 = t[1+idx];
4976       s1 -= v[0]*x1 + v[2]*x2;
4977       s2 -= v[1]*x1 + v[3]*x2;
4978       v += 4;
4979     }
4980     idc = 2*c[i];
4981     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4982     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4983   }
4984   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4985   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4986   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4987   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4988   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4989   PetscFunctionReturn(0);
4990 }
4991 
4992 /*
4993       Special case where the matrix was ILU(0) factored in the natural
4994    ordering. This eliminates the need for the column and row permutation.
4995 */
4996 #undef __FUNCT__
4997 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
4998 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
4999 {
5000   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5001   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
5002   PetscErrorCode    ierr;
5003   PetscInt          *diag = a->diag;
5004   const MatScalar   *aa=a->a,*v;
5005   PetscScalar       *x,s1,s2,x1,x2;
5006   const PetscScalar *b;
5007   PetscInt          jdx,idt,idx,nz,*vi,i;
5008 
5009   PetscFunctionBegin;
5010   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5011   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5012 
5013   /* forward solve the lower triangular */
5014   idx    = 0;
5015   x[0]   = b[0]; x[1] = b[1];
5016   for (i=1; i<n; i++) {
5017     v     =  aa      + 4*ai[i];
5018     vi    =  aj      + ai[i];
5019     nz    =  diag[i] - ai[i];
5020     idx   +=  2;
5021     s1  =  b[idx];s2 = b[1+idx];
5022     while (nz--) {
5023       jdx   = 2*(*vi++);
5024       x1    = x[jdx];x2 = x[1+jdx];
5025       s1 -= v[0]*x1 + v[2]*x2;
5026       s2 -= v[1]*x1 + v[3]*x2;
5027       v    += 4;
5028     }
5029     x[idx]   = s1;
5030     x[1+idx] = s2;
5031   }
5032   /* backward solve the upper triangular */
5033   for (i=n-1; i>=0; i--){
5034     v    = aa + 4*diag[i] + 4;
5035     vi   = aj + diag[i] + 1;
5036     nz   = ai[i+1] - diag[i] - 1;
5037     idt  = 2*i;
5038     s1 = x[idt];  s2 = x[1+idt];
5039     while (nz--) {
5040       idx   = 2*(*vi++);
5041       x1    = x[idx];   x2 = x[1+idx];
5042       s1 -= v[0]*x1 + v[2]*x2;
5043       s2 -= v[1]*x1 + v[3]*x2;
5044       v    += 4;
5045     }
5046     v        = aa +  4*diag[i];
5047     x[idt]   = v[0]*s1 + v[2]*s2;
5048     x[1+idt] = v[1]*s1 + v[3]*s2;
5049   }
5050 
5051   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5052   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5053   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5054   PetscFunctionReturn(0);
5055 }
5056 
5057 #undef __FUNCT__
5058 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
5059 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5060 {
5061     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5062     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
5063     PetscErrorCode    ierr;
5064     PetscInt          jdx;
5065     const MatScalar   *aa=a->a,*v;
5066     PetscScalar       *x,s1,s2,x1,x2;
5067     const PetscScalar *b;
5068 
5069     PetscFunctionBegin;
5070     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5071     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5072     /* forward solve the lower triangular */
5073     idx    = 0;
5074     x[0] = b[idx]; x[1] = b[1+idx];
5075     for (i=1; i<n; i++) {
5076         v   = aa + 4*ai[i];
5077        vi   = aj + ai[i];
5078        nz   = ai[i+1] - ai[i];
5079        idx  = 2*i;
5080        s1   = b[idx];s2 = b[1+idx];
5081       for(k=0;k<nz;k++){
5082          jdx   = 2*vi[k];
5083           x1    = x[jdx];x2 = x[1+jdx];
5084           s1   -= v[0]*x1 + v[2]*x2;
5085           s2   -= v[1]*x1 + v[3]*x2;
5086            v   +=  4;
5087         }
5088        x[idx]   = s1;
5089        x[1+idx] = s2;
5090     }
5091 
5092    /* backward solve the upper triangular */
5093   for (i=n-1; i>=0; i--){
5094      v   = aa + 4*(adiag[i+1]+1);
5095      vi  = aj + adiag[i+1]+1;
5096      nz  = adiag[i] - adiag[i+1]-1;
5097      idt = 2*i;
5098      s1 = x[idt];  s2 = x[1+idt];
5099      for(k=0;k<nz;k++){
5100       idx   = 2*vi[k];
5101        x1    = x[idx];   x2 = x[1+idx];
5102        s1 -= v[0]*x1 + v[2]*x2;
5103        s2 -= v[1]*x1 + v[3]*x2;
5104          v    += 4;
5105     }
5106     /* x = inv_diagonal*x */
5107    x[idt]   = v[0]*s1 + v[2]*s2;
5108    x[1+idt] = v[1]*s1 + v[3]*s2;
5109   }
5110 
5111   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5112   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5113   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5114   PetscFunctionReturn(0);
5115 }
5116 
5117 #undef __FUNCT__
5118 #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
5119 PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
5120 {
5121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5122   IS             iscol=a->col,isrow=a->row;
5123   PetscErrorCode ierr;
5124   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
5125   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
5126   MatScalar      *aa=a->a,*v;
5127   PetscScalar    *x,*b,s1,*t;
5128 
5129   PetscFunctionBegin;
5130   if (!n) PetscFunctionReturn(0);
5131 
5132   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5133   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5134   t  = a->solve_work;
5135 
5136   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5137   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
5138 
5139   /* forward solve the lower triangular */
5140   t[0] = b[*r++];
5141   for (i=1; i<n; i++) {
5142     v     = aa + ai[i];
5143     vi    = aj + ai[i];
5144     nz    = diag[i] - ai[i];
5145     s1  = b[*r++];
5146     while (nz--) {
5147       s1 -= (*v++)*t[*vi++];
5148     }
5149     t[i] = s1;
5150   }
5151   /* backward solve the upper triangular */
5152   for (i=n-1; i>=0; i--){
5153     v    = aa + diag[i] + 1;
5154     vi   = aj + diag[i] + 1;
5155     nz   = ai[i+1] - diag[i] - 1;
5156     s1 = t[i];
5157     while (nz--) {
5158       s1 -= (*v++)*t[*vi++];
5159     }
5160     x[*c--] = t[i] = aa[diag[i]]*s1;
5161   }
5162 
5163   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5164   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5165   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5166   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5167   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5168   PetscFunctionReturn(0);
5169 }
5170 /*
5171       Special case where the matrix was ILU(0) factored in the natural
5172    ordering. This eliminates the need for the column and row permutation.
5173 */
5174 #undef __FUNCT__
5175 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5176 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
5177 {
5178   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5179   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5180   PetscErrorCode ierr;
5181   PetscInt       *diag = a->diag;
5182   MatScalar      *aa=a->a;
5183   PetscScalar    *x,*b;
5184   PetscScalar    s1,x1;
5185   MatScalar      *v;
5186   PetscInt       jdx,idt,idx,nz,*vi,i;
5187 
5188   PetscFunctionBegin;
5189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5191 
5192   /* forward solve the lower triangular */
5193   idx    = 0;
5194   x[0]   = b[0];
5195   for (i=1; i<n; i++) {
5196     v     =  aa      + ai[i];
5197     vi    =  aj      + ai[i];
5198     nz    =  diag[i] - ai[i];
5199     idx   +=  1;
5200     s1  =  b[idx];
5201     while (nz--) {
5202       jdx   = *vi++;
5203       x1    = x[jdx];
5204       s1 -= v[0]*x1;
5205       v    += 1;
5206     }
5207     x[idx]   = s1;
5208   }
5209   /* backward solve the upper triangular */
5210   for (i=n-1; i>=0; i--){
5211     v    = aa + diag[i] + 1;
5212     vi   = aj + diag[i] + 1;
5213     nz   = ai[i+1] - diag[i] - 1;
5214     idt  = i;
5215     s1 = x[idt];
5216     while (nz--) {
5217       idx   = *vi++;
5218       x1    = x[idx];
5219       s1 -= v[0]*x1;
5220       v    += 1;
5221     }
5222     v        = aa +  diag[i];
5223     x[idt]   = v[0]*s1;
5224   }
5225   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5226   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5227   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
5228   PetscFunctionReturn(0);
5229 }
5230 
5231 /* ----------------------------------------------------------------*/
5232 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
5233 //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_inplace(Mat,PetscTruth);
5234 //EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5235 
5236 /* bs = 15 for PFLOTRAN */
5237 #undef __FUNCT__
5238 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15"
5239 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15(Mat B,Mat A,const MatFactorInfo *info)
5240 {
5241   Mat            C=B;
5242   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5243   IS             isrow = b->row,isicol = b->icol;
5244   PetscErrorCode ierr;
5245   const PetscInt *r,*ic,*ics;
5246   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5247   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj,*v_pivots;
5248   MatScalar      *rtmp,*pc,*mwork,*v,*v_work,*pv,*aa=a->a;
5249   PetscInt       bs2 = a->bs2,bs=A->rmap->bs,flg;
5250   PetscReal      shift = info->shiftinblocks;
5251 
5252   PetscFunctionBegin;
5253   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5254   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5255 
5256 
5257   /* generate work space needed by the factorization */
5258   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
5259   ierr = PetscMalloc2(bs,MatScalar,&v_work,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5260   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5261   ics  = ic;
5262 
5263   for (i=0; i<n; i++){
5264     /* zero rtmp */
5265     /* L part */
5266     nz    = bi[i+1] - bi[i];
5267     bjtmp = bj + bi[i];
5268     for  (j=0; j<nz; j++){
5269       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5270     }
5271 
5272     /* U part */
5273     nz = bdiag[i] - bdiag[i+1];
5274     bjtmp = bj + bdiag[i+1]+1;
5275     for  (j=0; j<nz; j++){
5276       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5277     }
5278 
5279     /* load in initial (unfactored row) */
5280     nz    = ai[r[i]+1] - ai[r[i]];
5281     ajtmp = aj + ai[r[i]];
5282     v     = aa + bs2*ai[r[i]];
5283     for (j=0; j<nz; j++) {
5284       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5285     }
5286 
5287     /* elimination */
5288     bjtmp = bj + bi[i];
5289     nzL   = bi[i+1] - bi[i];
5290     for(k=0;k < nzL;k++) {
5291       row = bjtmp[k];
5292       pc = rtmp + bs2*row;
5293       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5294       if (flg) {
5295         pv = b->a + bs2*bdiag[row];
5296 	/*   ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr); */
5297 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5298 
5299         pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5300         pv = b->a + bs2*(bdiag[row+1]+1);
5301         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5302         for (j=0; j<nz; j++) {
5303           v    = rtmp + bs2*pj[j];
5304 	  /*   ierr = Kernel_A_gets_A_minus_B_times_C_15(v,pc,pv);CHKERRQ(ierr); */
5305 	  Kernel_A_gets_A_minus_B_times_C(bs,v,pc,pv);
5306 	  pv  += bs2;
5307         }
5308         ierr = PetscLogFlops(2*bs2*bs*nz+2*bs2*bs-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5309       }
5310     }
5311 
5312     /* finished row so stick it into b->a */
5313     /* L part */
5314     pv   = b->a + bs2*bi[i] ;
5315     pj   = b->j + bi[i] ;
5316     nz   = bi[i+1] - bi[i];
5317     for (j=0; j<nz; j++) {
5318       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5319     }
5320 
5321     /* Mark diagonal and invert diagonal for simplier triangular solves */
5322     pv   = b->a + bs2*bdiag[i];
5323     pj   = b->j + bdiag[i];
5324     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5325     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5326 							    /*ierr = Kernel_A_gets_inverse_A_7(pv,shift);CHKERRQ(ierr); */
5327 
5328     /* U part */
5329     pv = b->a + bs2*(bdiag[i+1]+1);
5330     pj = b->j + bdiag[i+1]+1;
5331     nz = bdiag[i] - bdiag[i+1] - 1;
5332     for (j=0; j<nz; j++){
5333       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5334     }
5335   }
5336 
5337   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5338   ierr = PetscFree2(v_work,v_pivots);CHKERRQ(ierr);
5339   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5340   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5341   C->ops->solve          = MatSolve_SeqBAIJ_15;
5342   C->ops->solvetranspose = 0;
5343   C->assembled = PETSC_TRUE;
5344   ierr = PetscLogFlops(1.3333*bs2*n);CHKERRQ(ierr); /* from inverting diagonal blocks */
5345   PetscFunctionReturn(0);
5346 }
5347 
5348 #undef __FUNCT__
5349 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
5350 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
5351 {
5352   Mat            C=B;
5353   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
5354   IS             isrow = b->row,isicol = b->icol;
5355   PetscErrorCode ierr;
5356   const PetscInt *r,*ic,*ics;
5357   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
5358   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5359   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5360   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5361   MatScalar      *v_work;
5362   PetscTruth     col_identity,row_identity,both_identity;
5363 
5364   PetscFunctionBegin;
5365   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5366   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5367 
5368   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5369   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
5370   ics  = ic;
5371 
5372   /* generate work space needed by dense LU factorization */
5373   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5374 
5375   for (i=0; i<n; i++){
5376     /* zero rtmp */
5377     /* L part */
5378     nz    = bi[i+1] - bi[i];
5379     bjtmp = bj + bi[i];
5380     for  (j=0; j<nz; j++){
5381       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5382     }
5383 
5384     /* U part */
5385     nz = bdiag[i] - bdiag[i+1];
5386     bjtmp = bj + bdiag[i+1]+1;
5387     for  (j=0; j<nz; j++){
5388       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5389     }
5390 
5391     /* load in initial (unfactored row) */
5392     nz    = ai[r[i]+1] - ai[r[i]];
5393     ajtmp = aj + ai[r[i]];
5394     v     = aa + bs2*ai[r[i]];
5395     for (j=0; j<nz; j++) {
5396       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
5397     }
5398 
5399     /* elimination */
5400     bjtmp = bj + bi[i];
5401     nzL   = bi[i+1] - bi[i];
5402     for(k=0;k < nzL;k++) {
5403       row = bjtmp[k];
5404       pc = rtmp + bs2*row;
5405       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5406       if (flg) {
5407         pv         = b->a + bs2*bdiag[row];
5408         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
5409         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
5410         pv         = b->a + bs2*(bdiag[row+1]+1);
5411         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
5412         for (j=0; j<nz; j++) {
5413           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5414         }
5415         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
5416       }
5417     }
5418 
5419     /* finished row so stick it into b->a */
5420     /* L part */
5421     pv   = b->a + bs2*bi[i] ;
5422     pj   = b->j + bi[i] ;
5423     nz   = bi[i+1] - bi[i];
5424     for (j=0; j<nz; j++) {
5425       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5426     }
5427 
5428     /* Mark diagonal and invert diagonal for simplier triangular solves */
5429     pv  = b->a + bs2*bdiag[i];
5430     pj  = b->j + bdiag[i];
5431     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5432     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5433     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
5434 
5435     /* U part */
5436     pv = b->a + bs2*(bdiag[i+1]+1);
5437     pj = b->j + bdiag[i+1]+1;
5438     nz = bdiag[i] - bdiag[i+1] - 1;
5439     for (j=0; j<nz; j++){
5440       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5441     }
5442   }
5443 
5444   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5445   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
5446   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5447   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5448 
5449   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5450   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5451   both_identity = (PetscTruth) (row_identity && col_identity);
5452   if (both_identity){
5453     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5454   } else {
5455     C->ops->solve = MatSolve_SeqBAIJ_N;
5456   }
5457   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5458 
5459   C->assembled = PETSC_TRUE;
5460   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
5461   PetscFunctionReturn(0);
5462 }
5463 
5464 /*
5465    ilu(0) with natural ordering under new data structure.
5466    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
5467    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
5468 */
5469 
5470 #undef __FUNCT__
5471 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
5472 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5473 {
5474 
5475   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5476   PetscErrorCode     ierr;
5477   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5478   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5479 
5480   PetscFunctionBegin;
5481   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5482   b    = (Mat_SeqBAIJ*)(fact)->data;
5483 
5484   /* allocate matrix arrays for new data structure */
5485   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5486   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5487   b->singlemalloc = PETSC_TRUE;
5488   if (!b->diag){
5489     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5490     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5491   }
5492   bdiag = b->diag;
5493 
5494   if (n > 0) {
5495     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5496   }
5497 
5498   /* set bi and bj with new data structure */
5499   bi = b->i;
5500   bj = b->j;
5501 
5502   /* L part */
5503   bi[0] = 0;
5504   for (i=0; i<n; i++){
5505     nz = adiag[i] - ai[i];
5506     bi[i+1] = bi[i] + nz;
5507     aj = a->j + ai[i];
5508     for (j=0; j<nz; j++){
5509       *bj = aj[j]; bj++;
5510     }
5511   }
5512 
5513   /* U part */
5514   bi_temp = bi[n];
5515   bdiag[n] = bi[n]-1;
5516   for (i=n-1; i>=0; i--){
5517     nz = ai[i+1] - adiag[i] - 1;
5518     bi_temp = bi_temp + nz + 1;
5519     aj = a->j + adiag[i] + 1;
5520     for (j=0; j<nz; j++){
5521       *bj = aj[j]; bj++;
5522     }
5523     /* diag[i] */
5524     *bj = i; bj++;
5525     bdiag[i] = bi_temp - 1;
5526   }
5527   PetscFunctionReturn(0);
5528 }
5529 
5530 #undef __FUNCT__
5531 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
5532 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5533 {
5534   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5535   IS                 isicol;
5536   PetscErrorCode     ierr;
5537   const PetscInt     *r,*ic;
5538   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5539   PetscInt           *bi,*cols,nnz,*cols_lvl;
5540   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5541   PetscInt           i,levels,diagonal_fill;
5542   PetscTruth         col_identity,row_identity,both_identity;
5543   PetscReal          f;
5544   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5545   PetscBT            lnkbt;
5546   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5547   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5548   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5549   PetscTruth         missing;
5550   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5551   PetscTruth         olddatastruct = PETSC_FALSE;
5552 
5553   PetscFunctionBegin;
5554   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&olddatastruct,PETSC_NULL);CHKERRQ(ierr);
5555   if (olddatastruct){
5556     ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5557     PetscFunctionReturn(0);
5558   }
5559   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5560   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5561   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5562 
5563   f             = info->fill;
5564   levels        = (PetscInt)info->levels;
5565   diagonal_fill = (PetscInt)info->diagonal_fill;
5566   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5567 
5568   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5569   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5570   both_identity = (PetscTruth) (row_identity && col_identity);
5571 
5572   if (!levels && both_identity) {
5573     /* special case: ilu(0) with natural ordering */
5574     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5575     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5576 
5577     fact->factor = MAT_FACTOR_ILU;
5578     (fact)->info.factor_mallocs    = 0;
5579     (fact)->info.fill_ratio_given  = info->fill;
5580     (fact)->info.fill_ratio_needed = 1.0;
5581     b                = (Mat_SeqBAIJ*)(fact)->data;
5582     b->row           = isrow;
5583     b->col           = iscol;
5584     b->icol          = isicol;
5585     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5586     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5587     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5588     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5589     PetscFunctionReturn(0);
5590   }
5591 
5592   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5593   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5594 
5595   /* get new row pointers */
5596   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5597   bi[0] = 0;
5598   /* bdiag is location of diagonal in factor */
5599   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5600   bdiag[0]  = 0;
5601 
5602   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
5603 
5604   /* create a linked list for storing column indices of the active row */
5605   nlnk = n + 1;
5606   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5607 
5608   /* initial FreeSpace size is f*(ai[n]+1) */
5609   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5610   current_space = free_space;
5611   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5612   current_space_lvl = free_space_lvl;
5613 
5614   for (i=0; i<n; i++) {
5615     nzi = 0;
5616     /* copy current row into linked list */
5617     nnz  = ai[r[i]+1] - ai[r[i]];
5618     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5619     cols = aj + ai[r[i]];
5620     lnk[i] = -1; /* marker to indicate if diagonal exists */
5621     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5622     nzi += nlnk;
5623 
5624     /* make sure diagonal entry is included */
5625     if (diagonal_fill && lnk[i] == -1) {
5626       fm = n;
5627       while (lnk[fm] < i) fm = lnk[fm];
5628       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5629       lnk[fm]    = i;
5630       lnk_lvl[i] = 0;
5631       nzi++; dcount++;
5632     }
5633 
5634     /* add pivot rows into the active row */
5635     nzbd = 0;
5636     prow = lnk[n];
5637     while (prow < i) {
5638       nnz      = bdiag[prow];
5639       cols     = bj_ptr[prow] + nnz + 1;
5640       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5641       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5642       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5643       nzi += nlnk;
5644       prow = lnk[prow];
5645       nzbd++;
5646     }
5647     bdiag[i] = nzbd;
5648     bi[i+1]  = bi[i] + nzi;
5649 
5650     /* if free space is not available, make more free space */
5651     if (current_space->local_remaining<nzi) {
5652       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5653       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5654       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5655       reallocs++;
5656     }
5657 
5658     /* copy data into free_space and free_space_lvl, then initialize lnk */
5659     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5660     bj_ptr[i]    = current_space->array;
5661     bjlvl_ptr[i] = current_space_lvl->array;
5662 
5663     /* make sure the active row i has diagonal entry */
5664     if (*(bj_ptr[i]+bdiag[i]) != i) {
5665       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5666     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5667     }
5668 
5669     current_space->array           += nzi;
5670     current_space->local_used      += nzi;
5671     current_space->local_remaining -= nzi;
5672     current_space_lvl->array           += nzi;
5673     current_space_lvl->local_used      += nzi;
5674     current_space_lvl->local_remaining -= nzi;
5675   }
5676 
5677   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5678   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5679 
5680   /* destroy list of free space and other temporary arrays */
5681   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5682 
5683   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5684   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5685 
5686   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5687   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5688   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
5689 
5690 #if defined(PETSC_USE_INFO)
5691   {
5692     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5693     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5694     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5695     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5696     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5697     if (diagonal_fill) {
5698       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5699     }
5700   }
5701 #endif
5702 
5703   /* put together the new matrix */
5704   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5705   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5706   b = (Mat_SeqBAIJ*)(fact)->data;
5707   b->free_a       = PETSC_TRUE;
5708   b->free_ij      = PETSC_TRUE;
5709   b->singlemalloc = PETSC_FALSE;
5710   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5711   b->j          = bj;
5712   b->i          = bi;
5713   b->diag       = bdiag;
5714   b->free_diag  = PETSC_TRUE;
5715   b->ilen       = 0;
5716   b->imax       = 0;
5717   b->row        = isrow;
5718   b->col        = iscol;
5719   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5720   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5721   b->icol       = isicol;
5722   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5723   /* In b structure:  Free imax, ilen, old a, old j.
5724      Allocate bdiag, solve_work, new a, new j */
5725   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5726   b->maxnz = b->nz = bdiag[0]+1;
5727   fact->info.factor_mallocs    = reallocs;
5728   fact->info.fill_ratio_given  = f;
5729   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5730   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
5731   PetscFunctionReturn(0);
5732 }
5733 
5734 
5735 /*
5736      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
5737    except that the data structure of Mat_SeqAIJ is slightly different.
5738    Not a good example of code reuse.
5739 */
5740 #undef __FUNCT__
5741 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5742 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5743 {
5744   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
5745   IS             isicol;
5746   PetscErrorCode ierr;
5747   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
5748   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5749   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5750   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
5751   PetscTruth     col_identity,row_identity,both_identity,flg;
5752   PetscReal      f;
5753 
5754   PetscFunctionBegin;
5755   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5756   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
5757 
5758   f             = info->fill;
5759   levels        = (PetscInt)info->levels;
5760   diagonal_fill = (PetscInt)info->diagonal_fill;
5761   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5762 
5763   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5764   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5765   both_identity = (PetscTruth) (row_identity && col_identity);
5766 
5767   if (!levels && both_identity) {  /* special case copy the nonzero structure */
5768     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
5769     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5770 
5771     fact->factor = MAT_FACTOR_ILU;
5772     b            = (Mat_SeqBAIJ*)fact->data;
5773     b->row       = isrow;
5774     b->col       = iscol;
5775     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5776     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5777     b->icol      = isicol;
5778     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5779     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5780     PetscFunctionReturn(0);
5781   }
5782 
5783   /* general case perform the symbolic factorization */
5784     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5785     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5786 
5787     /* get new row pointers */
5788     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
5789     ainew[0] = 0;
5790     /* don't know how many column pointers are needed so estimate */
5791     jmax = (PetscInt)(f*ai[n] + 1);
5792     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
5793     /* ajfill is level of fill for each fill entry */
5794     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
5795     /* fill is a linked list of nonzeros in active row */
5796     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
5797     /* im is level for each filled value */
5798     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
5799     /* dloc is location of diagonal in factor */
5800     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
5801     dloc[0]  = 0;
5802     for (prow=0; prow<n; prow++) {
5803 
5804       /* copy prow into linked list */
5805       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
5806       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
5807       xi         = aj + ai[r[prow]];
5808       fill[n]    = n;
5809       fill[prow] = -1; /* marker for diagonal entry */
5810       while (nz--) {
5811 	fm  = n;
5812 	idx = ic[*xi++];
5813 	do {
5814 	  m  = fm;
5815 	  fm = fill[m];
5816 	} while (fm < idx);
5817 	fill[m]   = idx;
5818 	fill[idx] = fm;
5819 	im[idx]   = 0;
5820       }
5821 
5822       /* make sure diagonal entry is included */
5823       if (diagonal_fill && fill[prow] == -1) {
5824 	fm = n;
5825 	while (fill[fm] < prow) fm = fill[fm];
5826 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5827 	fill[fm]   = prow;
5828 	im[prow]   = 0;
5829 	nzf++;
5830 	dcount++;
5831       }
5832 
5833       nzi = 0;
5834       row = fill[n];
5835       while (row < prow) {
5836 	incrlev = im[row] + 1;
5837 	nz      = dloc[row];
5838 	xi      = ajnew  + ainew[row] + nz + 1;
5839 	flev    = ajfill + ainew[row] + nz + 1;
5840 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
5841 	fm      = row;
5842 	while (nnz-- > 0) {
5843 	  idx = *xi++;
5844 	  if (*flev + incrlev > levels) {
5845 	    flev++;
5846 	    continue;
5847 	  }
5848 	  do {
5849 	    m  = fm;
5850 	    fm = fill[m];
5851 	  } while (fm < idx);
5852 	  if (fm != idx) {
5853 	    im[idx]   = *flev + incrlev;
5854 	    fill[m]   = idx;
5855 	    fill[idx] = fm;
5856 	    fm        = idx;
5857 	    nzf++;
5858 	  } else {
5859 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
5860 	  }
5861 	  flev++;
5862 	}
5863 	row = fill[row];
5864 	nzi++;
5865       }
5866       /* copy new filled row into permanent storage */
5867       ainew[prow+1] = ainew[prow] + nzf;
5868       if (ainew[prow+1] > jmax) {
5869 
5870 	/* estimate how much additional space we will need */
5871 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5872 	/* just double the memory each time */
5873 	PetscInt maxadd = jmax;
5874 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
5875 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
5876 	jmax += maxadd;
5877 
5878 	/* allocate a longer ajnew and ajfill */
5879 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5880 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5881 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
5882 	ajnew = xitmp;
5883 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
5884 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5885 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
5886 	ajfill = xitmp;
5887 	reallocate++; /* count how many reallocations are needed */
5888       }
5889       xitmp       = ajnew + ainew[prow];
5890       flev        = ajfill + ainew[prow];
5891       dloc[prow]  = nzi;
5892       fm          = fill[n];
5893       while (nzf--) {
5894 	*xitmp++ = fm;
5895 	*flev++ = im[fm];
5896 	fm      = fill[fm];
5897       }
5898       /* make sure row has diagonal entry */
5899       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
5900 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5901     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5902       }
5903     }
5904     ierr = PetscFree(ajfill);CHKERRQ(ierr);
5905     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5906     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5907     ierr = PetscFree(fill);CHKERRQ(ierr);
5908     ierr = PetscFree(im);CHKERRQ(ierr);
5909 
5910 #if defined(PETSC_USE_INFO)
5911     {
5912       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5913       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5914       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5915       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5916       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5917       if (diagonal_fill) {
5918 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5919       }
5920     }
5921 #endif
5922 
5923     /* put together the new matrix */
5924     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5925     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5926     b    = (Mat_SeqBAIJ*)fact->data;
5927     b->free_a       = PETSC_TRUE;
5928     b->free_ij      = PETSC_TRUE;
5929     b->singlemalloc = PETSC_FALSE;
5930     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5931     b->j          = ajnew;
5932     b->i          = ainew;
5933     for (i=0; i<n; i++) dloc[i] += ainew[i];
5934     b->diag       = dloc;
5935     b->free_diag  = PETSC_TRUE;
5936     b->ilen       = 0;
5937     b->imax       = 0;
5938     b->row        = isrow;
5939     b->col        = iscol;
5940     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5941     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5942     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5943     b->icol       = isicol;
5944     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5945     /* In b structure:  Free imax, ilen, old a, old j.
5946        Allocate dloc, solve_work, new a, new j */
5947     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
5948     b->maxnz          = b->nz = ainew[n];
5949 
5950     fact->info.factor_mallocs    = reallocate;
5951     fact->info.fill_ratio_given  = f;
5952     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
5953 
5954   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
5955   PetscFunctionReturn(0);
5956 }
5957 
5958 #undef __FUNCT__
5959 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5960 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
5961 {
5962   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
5963   /* int i,*AJ=a->j,nz=a->nz; */
5964   PetscFunctionBegin;
5965   /* Undo Column scaling */
5966 /*    while (nz--) { */
5967 /*      AJ[i] = AJ[i]/4; */
5968 /*    } */
5969   /* This should really invoke a push/pop logic, but we don't have that yet. */
5970   A->ops->setunfactored = PETSC_NULL;
5971   PetscFunctionReturn(0);
5972 }
5973 
5974 #undef __FUNCT__
5975 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5976 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
5977 {
5978   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5979   PetscInt       *AJ=a->j,nz=a->nz;
5980   unsigned short *aj=(unsigned short *)AJ;
5981   PetscFunctionBegin;
5982   /* Is this really necessary? */
5983   while (nz--) {
5984     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
5985   }
5986   A->ops->setunfactored = PETSC_NULL;
5987   PetscFunctionReturn(0);
5988 }
5989 
5990 
5991