xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 8423f3f12a3f6cd04020199c9ea95a7dbab15a0a)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 #undef __FUNCT__
57 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59 {
60   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61   PetscErrorCode ierr;
62   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63   PetscInt       *diag = a->diag,oidx;
64   MatScalar      *aa=a->a,*v;
65   PetscScalar    s1,s2,x1,x2;
66   PetscScalar    *x,*b;
67 
68   PetscFunctionBegin;
69   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
70   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
71   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72 
73   /* forward solve the U^T */
74   idx = 0;
75   for (i=0; i<n; i++) {
76 
77     v     = aa + 4*diag[i];
78     /* multiply by the inverse of the block diagonal */
79     x1 = x[idx];   x2 = x[1+idx];
80     s1 = v[0]*x1  +  v[1]*x2;
81     s2 = v[2]*x1  +  v[3]*x2;
82     v += 4;
83 
84     vi    = aj + diag[i] + 1;
85     nz    = ai[i+1] - diag[i] - 1;
86     while (nz--) {
87       oidx = 2*(*vi++);
88       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90       v  += 4;
91     }
92     x[idx]   = s1;x[1+idx] = s2;
93     idx += 2;
94   }
95   /* backward solve the L^T */
96   for (i=n-1; i>=0; i--){
97     v    = aa + 4*diag[i] - 4;
98     vi   = aj + diag[i] - 1;
99     nz   = diag[i] - ai[i];
100     idt  = 2*i;
101     s1   = x[idt];  s2 = x[1+idt];
102     while (nz--) {
103       idx   = 2*(*vi--);
104       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106       v -= 4;
107     }
108   }
109   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
110   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
112   PetscFunctionReturn(0);
113 }
114 
115 #undef __FUNCT__
116 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118 {
119   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120   PetscErrorCode ierr;
121   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122   PetscInt       *diag = a->diag,oidx;
123   MatScalar      *aa=a->a,*v;
124   PetscScalar    s1,s2,s3,x1,x2,x3;
125   PetscScalar    *x,*b;
126 
127   PetscFunctionBegin;
128   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
129   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
130   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131 
132   /* forward solve the U^T */
133   idx = 0;
134   for (i=0; i<n; i++) {
135 
136     v     = aa + 9*diag[i];
137     /* multiply by the inverse of the block diagonal */
138     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142     v += 9;
143 
144     vi    = aj + diag[i] + 1;
145     nz    = ai[i+1] - diag[i] - 1;
146     while (nz--) {
147       oidx = 3*(*vi++);
148       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151       v  += 9;
152     }
153     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154     idx += 3;
155   }
156   /* backward solve the L^T */
157   for (i=n-1; i>=0; i--){
158     v    = aa + 9*diag[i] - 9;
159     vi   = aj + diag[i] - 1;
160     nz   = diag[i] - ai[i];
161     idt  = 3*i;
162     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163     while (nz--) {
164       idx   = 3*(*vi--);
165       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168       v -= 9;
169     }
170   }
171   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
172   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
174   PetscFunctionReturn(0);
175 }
176 
177 #undef __FUNCT__
178 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180 {
181   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182   PetscErrorCode ierr;
183   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184   PetscInt       *diag = a->diag,oidx;
185   MatScalar      *aa=a->a,*v;
186   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
187   PetscScalar    *x,*b;
188 
189   PetscFunctionBegin;
190   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
191   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
192   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193 
194   /* forward solve the U^T */
195   idx = 0;
196   for (i=0; i<n; i++) {
197 
198     v     = aa + 16*diag[i];
199     /* multiply by the inverse of the block diagonal */
200     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205     v += 16;
206 
207     vi    = aj + diag[i] + 1;
208     nz    = ai[i+1] - diag[i] - 1;
209     while (nz--) {
210       oidx = 4*(*vi++);
211       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215       v  += 16;
216     }
217     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218     idx += 4;
219   }
220   /* backward solve the L^T */
221   for (i=n-1; i>=0; i--){
222     v    = aa + 16*diag[i] - 16;
223     vi   = aj + diag[i] - 1;
224     nz   = diag[i] - ai[i];
225     idt  = 4*i;
226     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227     while (nz--) {
228       idx   = 4*(*vi--);
229       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233       v -= 16;
234     }
235   }
236   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
237   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
239   PetscFunctionReturn(0);
240 }
241 
242 #undef __FUNCT__
243 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245 {
246   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247   PetscErrorCode ierr;
248   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249   PetscInt       *diag = a->diag,oidx;
250   MatScalar      *aa=a->a,*v;
251   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
252   PetscScalar    *x,*b;
253 
254   PetscFunctionBegin;
255   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
256   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
257   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258 
259   /* forward solve the U^T */
260   idx = 0;
261   for (i=0; i<n; i++) {
262 
263     v     = aa + 25*diag[i];
264     /* multiply by the inverse of the block diagonal */
265     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271     v += 25;
272 
273     vi    = aj + diag[i] + 1;
274     nz    = ai[i+1] - diag[i] - 1;
275     while (nz--) {
276       oidx = 5*(*vi++);
277       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282       v  += 25;
283     }
284     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285     idx += 5;
286   }
287   /* backward solve the L^T */
288   for (i=n-1; i>=0; i--){
289     v    = aa + 25*diag[i] - 25;
290     vi   = aj + diag[i] - 1;
291     nz   = diag[i] - ai[i];
292     idt  = 5*i;
293     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294     while (nz--) {
295       idx   = 5*(*vi--);
296       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301       v -= 25;
302     }
303   }
304   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
305   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
307   PetscFunctionReturn(0);
308 }
309 
310 #undef __FUNCT__
311 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313 {
314   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315   PetscErrorCode ierr;
316   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317   PetscInt       *diag = a->diag,oidx;
318   MatScalar      *aa=a->a,*v;
319   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
320   PetscScalar    *x,*b;
321 
322   PetscFunctionBegin;
323   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
324   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
325   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326 
327   /* forward solve the U^T */
328   idx = 0;
329   for (i=0; i<n; i++) {
330 
331     v     = aa + 36*diag[i];
332     /* multiply by the inverse of the block diagonal */
333     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334     x6    = x[5+idx];
335     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341     v += 36;
342 
343     vi    = aj + diag[i] + 1;
344     nz    = ai[i+1] - diag[i] - 1;
345     while (nz--) {
346       oidx = 6*(*vi++);
347       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353       v  += 36;
354     }
355     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356     x[5+idx] = s6;
357     idx += 6;
358   }
359   /* backward solve the L^T */
360   for (i=n-1; i>=0; i--){
361     v    = aa + 36*diag[i] - 36;
362     vi   = aj + diag[i] - 1;
363     nz   = diag[i] - ai[i];
364     idt  = 6*i;
365     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366     s6 = x[5+idt];
367     while (nz--) {
368       idx   = 6*(*vi--);
369       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375       v -= 36;
376     }
377   }
378   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
379   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
381   PetscFunctionReturn(0);
382 }
383 
384 #undef __FUNCT__
385 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387 {
388   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389   PetscErrorCode ierr;
390   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391   PetscInt       *diag = a->diag,oidx;
392   MatScalar      *aa=a->a,*v;
393   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
394   PetscScalar    *x,*b;
395 
396   PetscFunctionBegin;
397   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
398   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
399   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400 
401   /* forward solve the U^T */
402   idx = 0;
403   for (i=0; i<n; i++) {
404 
405     v     = aa + 49*diag[i];
406     /* multiply by the inverse of the block diagonal */
407     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408     x6    = x[5+idx]; x7 = x[6+idx];
409     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416     v += 49;
417 
418     vi    = aj + diag[i] + 1;
419     nz    = ai[i+1] - diag[i] - 1;
420     while (nz--) {
421       oidx = 7*(*vi++);
422       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429       v  += 49;
430     }
431     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432     x[5+idx] = s6;x[6+idx] = s7;
433     idx += 7;
434   }
435   /* backward solve the L^T */
436   for (i=n-1; i>=0; i--){
437     v    = aa + 49*diag[i] - 49;
438     vi   = aj + diag[i] - 1;
439     nz   = diag[i] - ai[i];
440     idt  = 7*i;
441     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442     s6 = x[5+idt];s7 = x[6+idt];
443     while (nz--) {
444       idx   = 7*(*vi--);
445       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452       v -= 49;
453     }
454   }
455   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
458   PetscFunctionReturn(0);
459 }
460 
461 /*---------------------------------------------------------------------------------------------*/
462 #undef __FUNCT__
463 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465 {
466   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467   IS             iscol=a->col,isrow=a->row;
468   PetscErrorCode ierr;
469   const PetscInt *r,*c,*rout,*cout;
470   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471   PetscInt       *diag = a->diag;
472   MatScalar      *aa=a->a,*v;
473   PetscScalar    s1,*x,*b,*t;
474 
475   PetscFunctionBegin;
476   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
477   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478   t  = a->solve_work;
479 
480   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482 
483   /* copy the b into temp work space according to permutation */
484   for (i=0; i<n; i++) {
485     t[i] = b[c[i]];
486   }
487 
488   /* forward solve the U^T */
489   for (i=0; i<n; i++) {
490 
491     v     = aa + diag[i];
492     /* multiply by the inverse of the block diagonal */
493     s1    = (*v++)*t[i];
494     vi    = aj + diag[i] + 1;
495     nz    = ai[i+1] - diag[i] - 1;
496     while (nz--) {
497       t[*vi++]  -= (*v++)*s1;
498     }
499     t[i]   = s1;
500   }
501   /* backward solve the L^T */
502   for (i=n-1; i>=0; i--){
503     v    = aa + diag[i] - 1;
504     vi   = aj + diag[i] - 1;
505     nz   = diag[i] - ai[i];
506     s1   = t[i];
507     while (nz--) {
508       t[*vi--]   -=  (*v--)*s1;
509     }
510   }
511 
512   /* copy t into x according to permutation */
513   for (i=0; i<n; i++) {
514     x[r[i]]   = t[i];
515   }
516 
517   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
519   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
520   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522   PetscFunctionReturn(0);
523 }
524 
525 #undef __FUNCT__
526 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528 {
529   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530   IS             iscol=a->col,isrow=a->row;
531   PetscErrorCode ierr;
532   const PetscInt *r,*c,*rout,*cout;
533   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535   MatScalar      *aa=a->a,*v;
536   PetscScalar    s1,s2,x1,x2;
537   PetscScalar    *x,*b,*t;
538 
539   PetscFunctionBegin;
540   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
541   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542   t  = a->solve_work;
543 
544   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546 
547   /* copy the b into temp work space according to permutation */
548   ii = 0;
549   for (i=0; i<n; i++) {
550     ic      = 2*c[i];
551     t[ii]   = b[ic];
552     t[ii+1] = b[ic+1];
553     ii += 2;
554   }
555 
556   /* forward solve the U^T */
557   idx = 0;
558   for (i=0; i<n; i++) {
559 
560     v     = aa + 4*diag[i];
561     /* multiply by the inverse of the block diagonal */
562     x1    = t[idx];   x2 = t[1+idx];
563     s1 = v[0]*x1  +  v[1]*x2;
564     s2 = v[2]*x1  +  v[3]*x2;
565     v += 4;
566 
567     vi    = aj + diag[i] + 1;
568     nz    = ai[i+1] - diag[i] - 1;
569     while (nz--) {
570       oidx = 2*(*vi++);
571       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573       v  += 4;
574     }
575     t[idx]   = s1;t[1+idx] = s2;
576     idx += 2;
577   }
578   /* backward solve the L^T */
579   for (i=n-1; i>=0; i--){
580     v    = aa + 4*diag[i] - 4;
581     vi   = aj + diag[i] - 1;
582     nz   = diag[i] - ai[i];
583     idt  = 2*i;
584     s1 = t[idt];  s2 = t[1+idt];
585     while (nz--) {
586       idx   = 2*(*vi--);
587       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589       v -= 4;
590     }
591   }
592 
593   /* copy t into x according to permutation */
594   ii = 0;
595   for (i=0; i<n; i++) {
596     ir      = 2*r[i];
597     x[ir]   = t[ii];
598     x[ir+1] = t[ii+1];
599     ii += 2;
600   }
601 
602   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
604   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
605   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
607   PetscFunctionReturn(0);
608 }
609 
610 #undef __FUNCT__
611 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613 {
614   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615   IS             iscol=a->col,isrow=a->row;
616   PetscErrorCode ierr;
617   const PetscInt *r,*c,*rout,*cout;
618   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620   MatScalar      *aa=a->a,*v;
621   PetscScalar    s1,s2,s3,x1,x2,x3;
622   PetscScalar    *x,*b,*t;
623 
624   PetscFunctionBegin;
625   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
626   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627   t  = a->solve_work;
628 
629   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631 
632   /* copy the b into temp work space according to permutation */
633   ii = 0;
634   for (i=0; i<n; i++) {
635     ic      = 3*c[i];
636     t[ii]   = b[ic];
637     t[ii+1] = b[ic+1];
638     t[ii+2] = b[ic+2];
639     ii += 3;
640   }
641 
642   /* forward solve the U^T */
643   idx = 0;
644   for (i=0; i<n; i++) {
645 
646     v     = aa + 9*diag[i];
647     /* multiply by the inverse of the block diagonal */
648     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652     v += 9;
653 
654     vi    = aj + diag[i] + 1;
655     nz    = ai[i+1] - diag[i] - 1;
656     while (nz--) {
657       oidx = 3*(*vi++);
658       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661       v  += 9;
662     }
663     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664     idx += 3;
665   }
666   /* backward solve the L^T */
667   for (i=n-1; i>=0; i--){
668     v    = aa + 9*diag[i] - 9;
669     vi   = aj + diag[i] - 1;
670     nz   = diag[i] - ai[i];
671     idt  = 3*i;
672     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673     while (nz--) {
674       idx   = 3*(*vi--);
675       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678       v -= 9;
679     }
680   }
681 
682   /* copy t into x according to permutation */
683   ii = 0;
684   for (i=0; i<n; i++) {
685     ir      = 3*r[i];
686     x[ir]   = t[ii];
687     x[ir+1] = t[ii+1];
688     x[ir+2] = t[ii+2];
689     ii += 3;
690   }
691 
692   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
694   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
695   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
697   PetscFunctionReturn(0);
698 }
699 
700 #undef __FUNCT__
701 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703 {
704   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705   IS             iscol=a->col,isrow=a->row;
706   PetscErrorCode ierr;
707   const PetscInt *r,*c,*rout,*cout;
708   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710   MatScalar      *aa=a->a,*v;
711   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
712   PetscScalar    *x,*b,*t;
713 
714   PetscFunctionBegin;
715   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
716   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717   t  = a->solve_work;
718 
719   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721 
722   /* copy the b into temp work space according to permutation */
723   ii = 0;
724   for (i=0; i<n; i++) {
725     ic      = 4*c[i];
726     t[ii]   = b[ic];
727     t[ii+1] = b[ic+1];
728     t[ii+2] = b[ic+2];
729     t[ii+3] = b[ic+3];
730     ii += 4;
731   }
732 
733   /* forward solve the U^T */
734   idx = 0;
735   for (i=0; i<n; i++) {
736 
737     v     = aa + 16*diag[i];
738     /* multiply by the inverse of the block diagonal */
739     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744     v += 16;
745 
746     vi    = aj + diag[i] + 1;
747     nz    = ai[i+1] - diag[i] - 1;
748     while (nz--) {
749       oidx = 4*(*vi++);
750       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754       v  += 16;
755     }
756     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757     idx += 4;
758   }
759   /* backward solve the L^T */
760   for (i=n-1; i>=0; i--){
761     v    = aa + 16*diag[i] - 16;
762     vi   = aj + diag[i] - 1;
763     nz   = diag[i] - ai[i];
764     idt  = 4*i;
765     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766     while (nz--) {
767       idx   = 4*(*vi--);
768       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772       v -= 16;
773     }
774   }
775 
776   /* copy t into x according to permutation */
777   ii = 0;
778   for (i=0; i<n; i++) {
779     ir      = 4*r[i];
780     x[ir]   = t[ii];
781     x[ir+1] = t[ii+1];
782     x[ir+2] = t[ii+2];
783     x[ir+3] = t[ii+3];
784     ii += 4;
785   }
786 
787   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
789   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
790   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792   PetscFunctionReturn(0);
793 }
794 
795 #undef __FUNCT__
796 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798 {
799   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800   IS             iscol=a->col,isrow=a->row;
801   PetscErrorCode ierr;
802   const PetscInt *r,*c,*rout,*cout;
803   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805   MatScalar      *aa=a->a,*v;
806   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
807   PetscScalar    *x,*b,*t;
808 
809   PetscFunctionBegin;
810   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
811   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812   t  = a->solve_work;
813 
814   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816 
817   /* copy the b into temp work space according to permutation */
818   ii = 0;
819   for (i=0; i<n; i++) {
820     ic      = 5*c[i];
821     t[ii]   = b[ic];
822     t[ii+1] = b[ic+1];
823     t[ii+2] = b[ic+2];
824     t[ii+3] = b[ic+3];
825     t[ii+4] = b[ic+4];
826     ii += 5;
827   }
828 
829   /* forward solve the U^T */
830   idx = 0;
831   for (i=0; i<n; i++) {
832 
833     v     = aa + 25*diag[i];
834     /* multiply by the inverse of the block diagonal */
835     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841     v += 25;
842 
843     vi    = aj + diag[i] + 1;
844     nz    = ai[i+1] - diag[i] - 1;
845     while (nz--) {
846       oidx = 5*(*vi++);
847       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852       v  += 25;
853     }
854     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855     idx += 5;
856   }
857   /* backward solve the L^T */
858   for (i=n-1; i>=0; i--){
859     v    = aa + 25*diag[i] - 25;
860     vi   = aj + diag[i] - 1;
861     nz   = diag[i] - ai[i];
862     idt  = 5*i;
863     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864     while (nz--) {
865       idx   = 5*(*vi--);
866       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871       v -= 25;
872     }
873   }
874 
875   /* copy t into x according to permutation */
876   ii = 0;
877   for (i=0; i<n; i++) {
878     ir      = 5*r[i];
879     x[ir]   = t[ii];
880     x[ir+1] = t[ii+1];
881     x[ir+2] = t[ii+2];
882     x[ir+3] = t[ii+3];
883     x[ir+4] = t[ii+4];
884     ii += 5;
885   }
886 
887   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
889   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
890   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
892   PetscFunctionReturn(0);
893 }
894 
895 #undef __FUNCT__
896 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898 {
899   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900   IS             iscol=a->col,isrow=a->row;
901   PetscErrorCode ierr;
902   const PetscInt *r,*c,*rout,*cout;
903   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905   MatScalar      *aa=a->a,*v;
906   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
907   PetscScalar    *x,*b,*t;
908 
909   PetscFunctionBegin;
910   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
911   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912   t  = a->solve_work;
913 
914   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916 
917   /* copy the b into temp work space according to permutation */
918   ii = 0;
919   for (i=0; i<n; i++) {
920     ic      = 6*c[i];
921     t[ii]   = b[ic];
922     t[ii+1] = b[ic+1];
923     t[ii+2] = b[ic+2];
924     t[ii+3] = b[ic+3];
925     t[ii+4] = b[ic+4];
926     t[ii+5] = b[ic+5];
927     ii += 6;
928   }
929 
930   /* forward solve the U^T */
931   idx = 0;
932   for (i=0; i<n; i++) {
933 
934     v     = aa + 36*diag[i];
935     /* multiply by the inverse of the block diagonal */
936     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937     x6    = t[5+idx];
938     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944     v += 36;
945 
946     vi    = aj + diag[i] + 1;
947     nz    = ai[i+1] - diag[i] - 1;
948     while (nz--) {
949       oidx = 6*(*vi++);
950       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956       v  += 36;
957     }
958     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959     t[5+idx] = s6;
960     idx += 6;
961   }
962   /* backward solve the L^T */
963   for (i=n-1; i>=0; i--){
964     v    = aa + 36*diag[i] - 36;
965     vi   = aj + diag[i] - 1;
966     nz   = diag[i] - ai[i];
967     idt  = 6*i;
968     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969     s6 = t[5+idt];
970     while (nz--) {
971       idx   = 6*(*vi--);
972       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978       v -= 36;
979     }
980   }
981 
982   /* copy t into x according to permutation */
983   ii = 0;
984   for (i=0; i<n; i++) {
985     ir      = 6*r[i];
986     x[ir]   = t[ii];
987     x[ir+1] = t[ii+1];
988     x[ir+2] = t[ii+2];
989     x[ir+3] = t[ii+3];
990     x[ir+4] = t[ii+4];
991     x[ir+5] = t[ii+5];
992     ii += 6;
993   }
994 
995   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
997   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
998   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1000   PetscFunctionReturn(0);
1001 }
1002 
1003 #undef __FUNCT__
1004 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006 {
1007   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008   IS             iscol=a->col,isrow=a->row;
1009   PetscErrorCode ierr;
1010   const PetscInt *r,*c,*rout,*cout;
1011   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013   MatScalar      *aa=a->a,*v;
1014   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1015   PetscScalar    *x,*b,*t;
1016 
1017   PetscFunctionBegin;
1018   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1019   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020   t  = a->solve_work;
1021 
1022   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024 
1025   /* copy the b into temp work space according to permutation */
1026   ii = 0;
1027   for (i=0; i<n; i++) {
1028     ic      = 7*c[i];
1029     t[ii]   = b[ic];
1030     t[ii+1] = b[ic+1];
1031     t[ii+2] = b[ic+2];
1032     t[ii+3] = b[ic+3];
1033     t[ii+4] = b[ic+4];
1034     t[ii+5] = b[ic+5];
1035     t[ii+6] = b[ic+6];
1036     ii += 7;
1037   }
1038 
1039   /* forward solve the U^T */
1040   idx = 0;
1041   for (i=0; i<n; i++) {
1042 
1043     v     = aa + 49*diag[i];
1044     /* multiply by the inverse of the block diagonal */
1045     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046     x6    = t[5+idx]; x7 = t[6+idx];
1047     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054     v += 49;
1055 
1056     vi    = aj + diag[i] + 1;
1057     nz    = ai[i+1] - diag[i] - 1;
1058     while (nz--) {
1059       oidx = 7*(*vi++);
1060       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067       v  += 49;
1068     }
1069     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070     t[5+idx] = s6;t[6+idx] = s7;
1071     idx += 7;
1072   }
1073   /* backward solve the L^T */
1074   for (i=n-1; i>=0; i--){
1075     v    = aa + 49*diag[i] - 49;
1076     vi   = aj + diag[i] - 1;
1077     nz   = diag[i] - ai[i];
1078     idt  = 7*i;
1079     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080     s6 = t[5+idt];s7 = t[6+idt];
1081     while (nz--) {
1082       idx   = 7*(*vi--);
1083       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090       v -= 49;
1091     }
1092   }
1093 
1094   /* copy t into x according to permutation */
1095   ii = 0;
1096   for (i=0; i<n; i++) {
1097     ir      = 7*r[i];
1098     x[ir]   = t[ii];
1099     x[ir+1] = t[ii+1];
1100     x[ir+2] = t[ii+2];
1101     x[ir+3] = t[ii+3];
1102     x[ir+4] = t[ii+4];
1103     x[ir+5] = t[ii+5];
1104     x[ir+6] = t[ii+6];
1105     ii += 7;
1106   }
1107 
1108   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1110   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1111   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1113   PetscFunctionReturn(0);
1114 }
1115 
1116 /* ----------------------------------------------------------- */
1117 #undef __FUNCT__
1118 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1120 {
1121   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1122   IS             iscol=a->col,isrow=a->row;
1123   PetscErrorCode ierr;
1124   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125   PetscInt       i,n=a->mbs;
1126   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1127   MatScalar      *aa=a->a,*v;
1128   PetscScalar    *x,*b,*s,*t,*ls;
1129 
1130   PetscFunctionBegin;
1131   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1132   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133   t  = a->solve_work;
1134 
1135   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1136   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1137 
1138   /* forward solve the lower triangular */
1139   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1140   for (i=1; i<n; i++) {
1141     v   = aa + bs2*ai[i];
1142     vi  = aj + ai[i];
1143     nz  = a->diag[i] - ai[i];
1144     s = t + bs*i;
1145     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1146     while (nz--) {
1147       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1148       v += bs2;
1149     }
1150   }
1151   /* backward solve the upper triangular */
1152   ls = a->solve_work + A->cmap->n;
1153   for (i=n-1; i>=0; i--){
1154     v   = aa + bs2*(a->diag[i] + 1);
1155     vi  = aj + a->diag[i] + 1;
1156     nz  = ai[i+1] - a->diag[i] - 1;
1157     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1158     while (nz--) {
1159       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1160       v += bs2;
1161     }
1162     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1163     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1164   }
1165 
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 #undef __FUNCT__
1175 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1177 {
1178   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1179   IS             iscol=a->col,isrow=a->row;
1180   PetscErrorCode ierr;
1181   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1183   MatScalar      *aa=a->a,*v;
1184   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1185   PetscScalar    *x,*b,*t;
1186 
1187   PetscFunctionBegin;
1188   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1189   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190   t  = a->solve_work;
1191 
1192   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1193   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1194 
1195   /* forward solve the lower triangular */
1196   idx    = 7*(*r++);
1197   t[0] = b[idx];   t[1] = b[1+idx];
1198   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199   t[5] = b[5+idx]; t[6] = b[6+idx];
1200 
1201   for (i=1; i<n; i++) {
1202     v     = aa + 49*ai[i];
1203     vi    = aj + ai[i];
1204     nz    = diag[i] - ai[i];
1205     idx   = 7*(*r++);
1206     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1208     while (nz--) {
1209       idx   = 7*(*vi++);
1210       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211       x4    = t[3+idx];x5 = t[4+idx];
1212       x6    = t[5+idx];x7 = t[6+idx];
1213       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1220       v += 49;
1221     }
1222     idx = 7*i;
1223     t[idx]   = s1;t[1+idx] = s2;
1224     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225     t[5+idx] = s6;t[6+idx] = s7;
1226   }
1227   /* backward solve the upper triangular */
1228   for (i=n-1; i>=0; i--){
1229     v    = aa + 49*diag[i] + 49;
1230     vi   = aj + diag[i] + 1;
1231     nz   = ai[i+1] - diag[i] - 1;
1232     idt  = 7*i;
1233     s1 = t[idt];  s2 = t[1+idt];
1234     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235     s6 = t[5+idt];s7 = t[6+idt];
1236     while (nz--) {
1237       idx   = 7*(*vi++);
1238       x1    = t[idx];   x2 = t[1+idx];
1239       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240       x6    = t[5+idx]; x7 = t[6+idx];
1241       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1248       v += 49;
1249     }
1250     idc = 7*(*c--);
1251     v   = aa + 49*diag[i];
1252     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1266   }
1267 
1268   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1269   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1270   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1271   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1273   PetscFunctionReturn(0);
1274 }
1275 
1276 #undef __FUNCT__
1277 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1279 {
1280   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282   PetscErrorCode    ierr;
1283   PetscInt          *diag = a->diag,jdx;
1284   const MatScalar   *aa=a->a,*v;
1285   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286   const PetscScalar *b;
1287 
1288   PetscFunctionBegin;
1289   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1290   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1291   /* forward solve the lower triangular */
1292   idx    = 0;
1293   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1294   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1295   x[6] = b[6+idx];
1296   for (i=1; i<n; i++) {
1297     v     =  aa + 49*ai[i];
1298     vi    =  aj + ai[i];
1299     nz    =  diag[i] - ai[i];
1300     idx   =  7*i;
1301     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303     s7  =  b[6+idx];
1304     while (nz--) {
1305       jdx   = 7*(*vi++);
1306       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1307       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1308       x7    = x[6+jdx];
1309       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1316       v += 49;
1317      }
1318     x[idx]   = s1;
1319     x[1+idx] = s2;
1320     x[2+idx] = s3;
1321     x[3+idx] = s4;
1322     x[4+idx] = s5;
1323     x[5+idx] = s6;
1324     x[6+idx] = s7;
1325   }
1326   /* backward solve the upper triangular */
1327   for (i=n-1; i>=0; i--){
1328     v    = aa + 49*diag[i] + 49;
1329     vi   = aj + diag[i] + 1;
1330     nz   = ai[i+1] - diag[i] - 1;
1331     idt  = 7*i;
1332     s1 = x[idt];   s2 = x[1+idt];
1333     s3 = x[2+idt]; s4 = x[3+idt];
1334     s5 = x[4+idt]; s6 = x[5+idt];
1335     s7 = x[6+idt];
1336     while (nz--) {
1337       idx   = 7*(*vi++);
1338       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1339       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1340       x7    = x[6+idx];
1341       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1348       v += 49;
1349     }
1350     v        = aa + 49*diag[i];
1351     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1365   }
1366 
1367   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1368   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1370   PetscFunctionReturn(0);
1371 }
1372 
1373 #undef __FUNCT__
1374 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1375 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1376 {
1377     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1378     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1379     PetscErrorCode    ierr;
1380     PetscInt          idx,jdx,idt;
1381     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1382     const MatScalar   *aa=a->a,*v;
1383     PetscScalar       *x;
1384     const PetscScalar *b;
1385     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1386 
1387     PetscFunctionBegin;
1388     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1389     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1390     /* forward solve the lower triangular */
1391     idx    = 0;
1392     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1393     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1394     for (i=1; i<n; i++) {
1395        v    = aa + bs2*ai[i];
1396        vi   = aj + ai[i];
1397        nz   = ai[i+1] - ai[i];
1398       idx   = bs*i;
1399        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1400        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1401        while (nz--) {
1402           jdx   = bs*(*vi++);
1403           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1404 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1405           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1406           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1407           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1408 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1409           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1410 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1411 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1412           v   +=  bs2;
1413         }
1414 
1415        x[idx]   = s1;
1416        x[1+idx] = s2;
1417        x[2+idx] = s3;
1418        x[3+idx] = s4;
1419        x[4+idx] = s5;
1420        x[5+idx] = s6;
1421        x[6+idx] = s7;
1422     }
1423 
1424    /* backward solve the upper triangular */
1425   for (i=n-1; i>=0; i--){
1426      v   = aa + bs2*ai[2*n-i];
1427      vi  = aj + ai[2*n-i];
1428      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1429      idt = bs*i;
1430      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1431      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1432     while (nz--) {
1433       idx   = bs*(*vi++);
1434        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1435        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1436        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1437        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1438        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1439        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1440        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1441        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1442        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1443         v   +=  bs2;
1444     }
1445     /* x = inv_diagonal*x */
1446     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1447     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1448     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1449     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1450     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1451     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1452     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1453   }
1454 
1455   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1456   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1457   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1458   PetscFunctionReturn(0);
1459 }
1460 
1461 #undef __FUNCT__
1462 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1463 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1464 {
1465   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1466   IS                iscol=a->col,isrow=a->row;
1467   PetscErrorCode    ierr;
1468   const PetscInt    *r,*c,*rout,*cout;
1469   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1470   const MatScalar   *aa=a->a,*v;
1471   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1472   const PetscScalar *b;
1473   PetscFunctionBegin;
1474   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1475   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1476   t  = a->solve_work;
1477 
1478   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1479   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1480 
1481   /* forward solve the lower triangular */
1482   idx    = 6*(*r++);
1483   t[0] = b[idx];   t[1] = b[1+idx];
1484   t[2] = b[2+idx]; t[3] = b[3+idx];
1485   t[4] = b[4+idx]; t[5] = b[5+idx];
1486   for (i=1; i<n; i++) {
1487     v     = aa + 36*ai[i];
1488     vi    = aj + ai[i];
1489     nz    = diag[i] - ai[i];
1490     idx   = 6*(*r++);
1491     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1492     s5  = b[4+idx]; s6 = b[5+idx];
1493     while (nz--) {
1494       idx   = 6*(*vi++);
1495       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1496       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1497       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1503       v += 36;
1504     }
1505     idx = 6*i;
1506     t[idx]   = s1;t[1+idx] = s2;
1507     t[2+idx] = s3;t[3+idx] = s4;
1508     t[4+idx] = s5;t[5+idx] = s6;
1509   }
1510   /* backward solve the upper triangular */
1511   for (i=n-1; i>=0; i--){
1512     v    = aa + 36*diag[i] + 36;
1513     vi   = aj + diag[i] + 1;
1514     nz   = ai[i+1] - diag[i] - 1;
1515     idt  = 6*i;
1516     s1 = t[idt];  s2 = t[1+idt];
1517     s3 = t[2+idt];s4 = t[3+idt];
1518     s5 = t[4+idt];s6 = t[5+idt];
1519     while (nz--) {
1520       idx   = 6*(*vi++);
1521       x1    = t[idx];   x2 = t[1+idx];
1522       x3    = t[2+idx]; x4 = t[3+idx];
1523       x5    = t[4+idx]; x6 = t[5+idx];
1524       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1525       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1526       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1527       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1528       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1529       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1530       v += 36;
1531     }
1532     idc = 6*(*c--);
1533     v   = aa + 36*diag[i];
1534     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1535                                  v[18]*s4+v[24]*s5+v[30]*s6;
1536     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1537                                  v[19]*s4+v[25]*s5+v[31]*s6;
1538     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1539                                  v[20]*s4+v[26]*s5+v[32]*s6;
1540     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1541                                  v[21]*s4+v[27]*s5+v[33]*s6;
1542     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1543                                  v[22]*s4+v[28]*s5+v[34]*s6;
1544     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1545                                  v[23]*s4+v[29]*s5+v[35]*s6;
1546   }
1547 
1548   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1549   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1550   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1551   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1552   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1553   PetscFunctionReturn(0);
1554 }
1555 
1556 #undef __FUNCT__
1557 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1558 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1559 {
1560   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1561   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1562   PetscErrorCode    ierr;
1563   PetscInt          *diag = a->diag,jdx;
1564   const MatScalar   *aa=a->a,*v;
1565   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1566   const PetscScalar *b;
1567 
1568   PetscFunctionBegin;
1569   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1570   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1571   /* forward solve the lower triangular */
1572   idx    = 0;
1573   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1574   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1575   for (i=1; i<n; i++) {
1576     v     =  aa + 36*ai[i];
1577     vi    =  aj + ai[i];
1578     nz    =  diag[i] - ai[i];
1579     idx   =  6*i;
1580     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1581     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1582     while (nz--) {
1583       jdx   = 6*(*vi++);
1584       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1585       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1586       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1587       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1588       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1589       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1590       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1591       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1592       v += 36;
1593      }
1594     x[idx]   = s1;
1595     x[1+idx] = s2;
1596     x[2+idx] = s3;
1597     x[3+idx] = s4;
1598     x[4+idx] = s5;
1599     x[5+idx] = s6;
1600   }
1601   /* backward solve the upper triangular */
1602   for (i=n-1; i>=0; i--){
1603     v    = aa + 36*diag[i] + 36;
1604     vi   = aj + diag[i] + 1;
1605     nz   = ai[i+1] - diag[i] - 1;
1606     idt  = 6*i;
1607     s1 = x[idt];   s2 = x[1+idt];
1608     s3 = x[2+idt]; s4 = x[3+idt];
1609     s5 = x[4+idt]; s6 = x[5+idt];
1610     while (nz--) {
1611       idx   = 6*(*vi++);
1612       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1613       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1614       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1615       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1616       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1617       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1618       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1619       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1620       v += 36;
1621     }
1622     v        = aa + 36*diag[i];
1623     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1624     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1625     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1626     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1627     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1628     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1629   }
1630 
1631   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1632   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1634   PetscFunctionReturn(0);
1635 }
1636 
1637 #undef __FUNCT__
1638 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1639 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1640 {
1641     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1642     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1643     PetscErrorCode    ierr;
1644     PetscInt          idx,jdx,idt;
1645     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1646     const MatScalar   *aa=a->a,*v;
1647     PetscScalar       *x;
1648     const PetscScalar *b;
1649     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1650 
1651     PetscFunctionBegin;
1652     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1653     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654     /* forward solve the lower triangular */
1655     idx    = 0;
1656     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1657     x[4] = b[4+idx];x[5] = b[5+idx];
1658     for (i=1; i<n; i++) {
1659        v    = aa + bs2*ai[i];
1660        vi   = aj + ai[i];
1661        nz   = ai[i+1] - ai[i];
1662       idx   = bs*i;
1663        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1664        s5   = b[4+idx];s6 = b[5+idx];
1665        while (nz--) {
1666           jdx   = bs*(*vi++);
1667           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1668 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1669           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1670           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1671           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1672 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1673           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1674 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1675           v   +=  bs2;
1676         }
1677 
1678        x[idx]   = s1;
1679        x[1+idx] = s2;
1680        x[2+idx] = s3;
1681        x[3+idx] = s4;
1682        x[4+idx] = s5;
1683        x[5+idx] = s6;
1684     }
1685 
1686    /* backward solve the upper triangular */
1687   for (i=n-1; i>=0; i--){
1688      v   = aa + bs2*ai[2*n-i];
1689      vi  = aj + ai[2*n-i];
1690      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1691      idt = bs*i;
1692      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1693      s5 = x[4+idt];s6 = x[5+idt];
1694     while (nz--) {
1695       idx   = bs*(*vi++);
1696        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1697        x5    = x[4+idx];x6 = x[5+idx];
1698        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1699        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1700        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1701        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1702        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1703        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1704         v   +=  bs2;
1705     }
1706     /* x = inv_diagonal*x */
1707    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1708    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1709    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1710    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1711    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1712    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1713   }
1714 
1715   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1716   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1717   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1718   PetscFunctionReturn(0);
1719 }
1720 #undef __FUNCT__
1721 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1722 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1723 {
1724   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1725   IS                iscol=a->col,isrow=a->row;
1726   PetscErrorCode    ierr;
1727   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1728   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1729   const MatScalar   *aa=a->a,*v;
1730   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1731   const PetscScalar *b;
1732 
1733   PetscFunctionBegin;
1734   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1735   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1736   t  = a->solve_work;
1737 
1738   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1739   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1740 
1741   /* forward solve the lower triangular */
1742   idx    = 5*(*r++);
1743   t[0] = b[idx];   t[1] = b[1+idx];
1744   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1745   for (i=1; i<n; i++) {
1746     v     = aa + 25*ai[i];
1747     vi    = aj + ai[i];
1748     nz    = diag[i] - ai[i];
1749     idx   = 5*(*r++);
1750     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1751     s5  = b[4+idx];
1752     while (nz--) {
1753       idx   = 5*(*vi++);
1754       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1755       x4    = t[3+idx];x5 = t[4+idx];
1756       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1757       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1758       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1759       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1760       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1761       v += 25;
1762     }
1763     idx = 5*i;
1764     t[idx]   = s1;t[1+idx] = s2;
1765     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1766   }
1767   /* backward solve the upper triangular */
1768   for (i=n-1; i>=0; i--){
1769     v    = aa + 25*diag[i] + 25;
1770     vi   = aj + diag[i] + 1;
1771     nz   = ai[i+1] - diag[i] - 1;
1772     idt  = 5*i;
1773     s1 = t[idt];  s2 = t[1+idt];
1774     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1775     while (nz--) {
1776       idx   = 5*(*vi++);
1777       x1    = t[idx];   x2 = t[1+idx];
1778       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1779       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1780       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1781       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1782       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1783       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1784       v += 25;
1785     }
1786     idc = 5*(*c--);
1787     v   = aa + 25*diag[i];
1788     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1789                                  v[15]*s4+v[20]*s5;
1790     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1791                                  v[16]*s4+v[21]*s5;
1792     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1793                                  v[17]*s4+v[22]*s5;
1794     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1795                                  v[18]*s4+v[23]*s5;
1796     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1797                                  v[19]*s4+v[24]*s5;
1798   }
1799 
1800   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1801   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1802   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1803   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1804   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1805   PetscFunctionReturn(0);
1806 }
1807 
1808 #undef __FUNCT__
1809 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1810 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1811 {
1812   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1813   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1814   PetscErrorCode    ierr;
1815   PetscInt          *diag = a->diag,jdx;
1816   const MatScalar   *aa=a->a,*v;
1817   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1818   const PetscScalar *b;
1819 
1820   PetscFunctionBegin;
1821   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1822   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1823   /* forward solve the lower triangular */
1824   idx    = 0;
1825   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1826   for (i=1; i<n; i++) {
1827     v     =  aa + 25*ai[i];
1828     vi    =  aj + ai[i];
1829     nz    =  diag[i] - ai[i];
1830     idx   =  5*i;
1831     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1832     while (nz--) {
1833       jdx   = 5*(*vi++);
1834       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1835       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1836       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1837       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1838       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1839       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1840       v    += 25;
1841     }
1842     x[idx]   = s1;
1843     x[1+idx] = s2;
1844     x[2+idx] = s3;
1845     x[3+idx] = s4;
1846     x[4+idx] = s5;
1847   }
1848   /* backward solve the upper triangular */
1849   for (i=n-1; i>=0; i--){
1850     v    = aa + 25*diag[i] + 25;
1851     vi   = aj + diag[i] + 1;
1852     nz   = ai[i+1] - diag[i] - 1;
1853     idt  = 5*i;
1854     s1 = x[idt];  s2 = x[1+idt];
1855     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1856     while (nz--) {
1857       idx   = 5*(*vi++);
1858       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1859       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1860       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1861       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1862       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1863       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1864       v    += 25;
1865     }
1866     v        = aa + 25*diag[i];
1867     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1868     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1869     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1870     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1871     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1872   }
1873 
1874   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1875   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1876   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1877   PetscFunctionReturn(0);
1878 }
1879 
1880 #undef __FUNCT__
1881 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
1882 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1883 {
1884   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1885   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1886   PetscErrorCode    ierr;
1887   PetscInt          jdx;
1888   const MatScalar   *aa=a->a,*v;
1889   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1890   const PetscScalar *b;
1891 
1892   PetscFunctionBegin;
1893   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1894   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1895   /* forward solve the lower triangular */
1896   idx    = 0;
1897   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1898   for (i=1; i<n; i++) {
1899     v   = aa + 25*ai[i];
1900     vi  = aj + ai[i];
1901     nz  = ai[i+1] - ai[i];
1902     idx = 5*i;
1903     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1904     while (nz--) {
1905       jdx   = 5*(*vi++);
1906       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1907       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1908       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1909       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1910       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1911       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1912       v    += 25;
1913     }
1914     x[idx]   = s1;
1915     x[1+idx] = s2;
1916     x[2+idx] = s3;
1917     x[3+idx] = s4;
1918     x[4+idx] = s5;
1919   }
1920 
1921   /* backward solve the upper triangular */
1922   for (i=n-1; i>=0; i--){
1923     v   = aa + 25*ai[2*n-i];
1924     vi  = aj + ai[2*n-i];
1925     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1926     idt = 5*i;
1927     s1 = x[idt];  s2 = x[1+idt];
1928     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1929     while (nz--) {
1930       idx   = 5*(*vi++);
1931       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1932       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1933       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1934       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1935       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1936       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1937       v    += 25;
1938     }
1939     /* x = inv_diagonal*x */
1940     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1941     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1942     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1943     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1944     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1945   }
1946 
1947   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1949   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1950   PetscFunctionReturn(0);
1951 }
1952 
1953 #undef __FUNCT__
1954 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1955 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1956 {
1957   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1958   IS                iscol=a->col,isrow=a->row;
1959   PetscErrorCode    ierr;
1960   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1961   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1962   const MatScalar   *aa=a->a,*v;
1963   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1964   const PetscScalar *b;
1965 
1966   PetscFunctionBegin;
1967   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1968   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1969   t  = a->solve_work;
1970 
1971   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1972   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1973 
1974   /* forward solve the lower triangular */
1975   idx    = 4*(*r++);
1976   t[0] = b[idx];   t[1] = b[1+idx];
1977   t[2] = b[2+idx]; t[3] = b[3+idx];
1978   for (i=1; i<n; i++) {
1979     v     = aa + 16*ai[i];
1980     vi    = aj + ai[i];
1981     nz    = diag[i] - ai[i];
1982     idx   = 4*(*r++);
1983     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1984     while (nz--) {
1985       idx   = 4*(*vi++);
1986       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1987       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1988       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1989       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1990       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1991       v    += 16;
1992     }
1993     idx        = 4*i;
1994     t[idx]   = s1;t[1+idx] = s2;
1995     t[2+idx] = s3;t[3+idx] = s4;
1996   }
1997   /* backward solve the upper triangular */
1998   for (i=n-1; i>=0; i--){
1999     v    = aa + 16*diag[i] + 16;
2000     vi   = aj + diag[i] + 1;
2001     nz   = ai[i+1] - diag[i] - 1;
2002     idt  = 4*i;
2003     s1 = t[idt];  s2 = t[1+idt];
2004     s3 = t[2+idt];s4 = t[3+idt];
2005     while (nz--) {
2006       idx   = 4*(*vi++);
2007       x1    = t[idx];   x2 = t[1+idx];
2008       x3    = t[2+idx]; x4 = t[3+idx];
2009       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2010       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2011       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2012       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2013       v += 16;
2014     }
2015     idc      = 4*(*c--);
2016     v        = aa + 16*diag[i];
2017     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2018     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2019     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2020     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2021   }
2022 
2023   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2024   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2025   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2026   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2027   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2028   PetscFunctionReturn(0);
2029 }
2030 
2031 #undef __FUNCT__
2032 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2033 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2034 {
2035   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2036   IS                iscol=a->col,isrow=a->row;
2037   PetscErrorCode    ierr;
2038   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2039   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2040   const MatScalar   *aa=a->a,*v;
2041   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2042   PetscScalar       *x;
2043   const PetscScalar *b;
2044 
2045   PetscFunctionBegin;
2046   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2047   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2048   t  = (MatScalar *)a->solve_work;
2049 
2050   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2051   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2052 
2053   /* forward solve the lower triangular */
2054   idx    = 4*(*r++);
2055   t[0] = (MatScalar)b[idx];
2056   t[1] = (MatScalar)b[1+idx];
2057   t[2] = (MatScalar)b[2+idx];
2058   t[3] = (MatScalar)b[3+idx];
2059   for (i=1; i<n; i++) {
2060     v     = aa + 16*ai[i];
2061     vi    = aj + ai[i];
2062     nz    = diag[i] - ai[i];
2063     idx   = 4*(*r++);
2064     s1 = (MatScalar)b[idx];
2065     s2 = (MatScalar)b[1+idx];
2066     s3 = (MatScalar)b[2+idx];
2067     s4 = (MatScalar)b[3+idx];
2068     while (nz--) {
2069       idx   = 4*(*vi++);
2070       x1  = t[idx];
2071       x2  = t[1+idx];
2072       x3  = t[2+idx];
2073       x4  = t[3+idx];
2074       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2075       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2076       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2077       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2078       v    += 16;
2079     }
2080     idx        = 4*i;
2081     t[idx]   = s1;
2082     t[1+idx] = s2;
2083     t[2+idx] = s3;
2084     t[3+idx] = s4;
2085   }
2086   /* backward solve the upper triangular */
2087   for (i=n-1; i>=0; i--){
2088     v    = aa + 16*diag[i] + 16;
2089     vi   = aj + diag[i] + 1;
2090     nz   = ai[i+1] - diag[i] - 1;
2091     idt  = 4*i;
2092     s1 = t[idt];
2093     s2 = t[1+idt];
2094     s3 = t[2+idt];
2095     s4 = t[3+idt];
2096     while (nz--) {
2097       idx   = 4*(*vi++);
2098       x1  = t[idx];
2099       x2  = t[1+idx];
2100       x3  = t[2+idx];
2101       x4  = t[3+idx];
2102       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2103       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2104       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2105       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2106       v += 16;
2107     }
2108     idc      = 4*(*c--);
2109     v        = aa + 16*diag[i];
2110     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2111     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2112     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2113     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2114     x[idc]   = (PetscScalar)t[idt];
2115     x[1+idc] = (PetscScalar)t[1+idt];
2116     x[2+idc] = (PetscScalar)t[2+idt];
2117     x[3+idc] = (PetscScalar)t[3+idt];
2118  }
2119 
2120   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2121   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2122   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2123   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2124   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2125   PetscFunctionReturn(0);
2126 }
2127 
2128 #if defined (PETSC_HAVE_SSE)
2129 
2130 #include PETSC_HAVE_SSE
2131 
2132 #undef __FUNCT__
2133 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2134 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2135 {
2136   /*
2137      Note: This code uses demotion of double
2138      to float when performing the mixed-mode computation.
2139      This may not be numerically reasonable for all applications.
2140   */
2141   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2142   IS             iscol=a->col,isrow=a->row;
2143   PetscErrorCode ierr;
2144   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2145   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2146   MatScalar      *aa=a->a,*v;
2147   PetscScalar    *x,*b,*t;
2148 
2149   /* Make space in temp stack for 16 Byte Aligned arrays */
2150   float           ssealignedspace[11],*tmps,*tmpx;
2151   unsigned long   offset;
2152 
2153   PetscFunctionBegin;
2154   SSE_SCOPE_BEGIN;
2155 
2156     offset = (unsigned long)ssealignedspace % 16;
2157     if (offset) offset = (16 - offset)/4;
2158     tmps = &ssealignedspace[offset];
2159     tmpx = &ssealignedspace[offset+4];
2160     PREFETCH_NTA(aa+16*ai[1]);
2161 
2162     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2163     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2164     t  = a->solve_work;
2165 
2166     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2167     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2168 
2169     /* forward solve the lower triangular */
2170     idx  = 4*(*r++);
2171     t[0] = b[idx];   t[1] = b[1+idx];
2172     t[2] = b[2+idx]; t[3] = b[3+idx];
2173     v    =  aa + 16*ai[1];
2174 
2175     for (i=1; i<n;) {
2176       PREFETCH_NTA(&v[8]);
2177       vi   =  aj      + ai[i];
2178       nz   =  diag[i] - ai[i];
2179       idx  =  4*(*r++);
2180 
2181       /* Demote sum from double to float */
2182       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2183       LOAD_PS(tmps,XMM7);
2184 
2185       while (nz--) {
2186         PREFETCH_NTA(&v[16]);
2187         idx = 4*(*vi++);
2188 
2189         /* Demote solution (so far) from double to float */
2190         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2191 
2192         /* 4x4 Matrix-Vector product with negative accumulation: */
2193         SSE_INLINE_BEGIN_2(tmpx,v)
2194           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2195 
2196           /* First Column */
2197           SSE_COPY_PS(XMM0,XMM6)
2198           SSE_SHUFFLE(XMM0,XMM0,0x00)
2199           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2200           SSE_SUB_PS(XMM7,XMM0)
2201 
2202           /* Second Column */
2203           SSE_COPY_PS(XMM1,XMM6)
2204           SSE_SHUFFLE(XMM1,XMM1,0x55)
2205           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2206           SSE_SUB_PS(XMM7,XMM1)
2207 
2208           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2209 
2210           /* Third Column */
2211           SSE_COPY_PS(XMM2,XMM6)
2212           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2213           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2214           SSE_SUB_PS(XMM7,XMM2)
2215 
2216           /* Fourth Column */
2217           SSE_COPY_PS(XMM3,XMM6)
2218           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2219           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2220           SSE_SUB_PS(XMM7,XMM3)
2221         SSE_INLINE_END_2
2222 
2223         v  += 16;
2224       }
2225       idx = 4*i;
2226       v   = aa + 16*ai[++i];
2227       PREFETCH_NTA(v);
2228       STORE_PS(tmps,XMM7);
2229 
2230       /* Promote result from float to double */
2231       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2232     }
2233     /* backward solve the upper triangular */
2234     idt  = 4*(n-1);
2235     ai16 = 16*diag[n-1];
2236     v    = aa + ai16 + 16;
2237     for (i=n-1; i>=0;){
2238       PREFETCH_NTA(&v[8]);
2239       vi = aj + diag[i] + 1;
2240       nz = ai[i+1] - diag[i] - 1;
2241 
2242       /* Demote accumulator from double to float */
2243       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2244       LOAD_PS(tmps,XMM7);
2245 
2246       while (nz--) {
2247         PREFETCH_NTA(&v[16]);
2248         idx = 4*(*vi++);
2249 
2250         /* Demote solution (so far) from double to float */
2251         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2252 
2253         /* 4x4 Matrix-Vector Product with negative accumulation: */
2254         SSE_INLINE_BEGIN_2(tmpx,v)
2255           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2256 
2257           /* First Column */
2258           SSE_COPY_PS(XMM0,XMM6)
2259           SSE_SHUFFLE(XMM0,XMM0,0x00)
2260           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2261           SSE_SUB_PS(XMM7,XMM0)
2262 
2263           /* Second Column */
2264           SSE_COPY_PS(XMM1,XMM6)
2265           SSE_SHUFFLE(XMM1,XMM1,0x55)
2266           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2267           SSE_SUB_PS(XMM7,XMM1)
2268 
2269           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2270 
2271           /* Third Column */
2272           SSE_COPY_PS(XMM2,XMM6)
2273           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2274           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2275           SSE_SUB_PS(XMM7,XMM2)
2276 
2277           /* Fourth Column */
2278           SSE_COPY_PS(XMM3,XMM6)
2279           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2280           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2281           SSE_SUB_PS(XMM7,XMM3)
2282         SSE_INLINE_END_2
2283         v  += 16;
2284       }
2285       v    = aa + ai16;
2286       ai16 = 16*diag[--i];
2287       PREFETCH_NTA(aa+ai16+16);
2288       /*
2289          Scale the result by the diagonal 4x4 block,
2290          which was inverted as part of the factorization
2291       */
2292       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2293         /* First Column */
2294         SSE_COPY_PS(XMM0,XMM7)
2295         SSE_SHUFFLE(XMM0,XMM0,0x00)
2296         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2297 
2298         /* Second Column */
2299         SSE_COPY_PS(XMM1,XMM7)
2300         SSE_SHUFFLE(XMM1,XMM1,0x55)
2301         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2302         SSE_ADD_PS(XMM0,XMM1)
2303 
2304         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2305 
2306         /* Third Column */
2307         SSE_COPY_PS(XMM2,XMM7)
2308         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2309         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2310         SSE_ADD_PS(XMM0,XMM2)
2311 
2312         /* Fourth Column */
2313         SSE_COPY_PS(XMM3,XMM7)
2314         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2315         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2316         SSE_ADD_PS(XMM0,XMM3)
2317 
2318         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2319       SSE_INLINE_END_3
2320 
2321       /* Promote solution from float to double */
2322       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2323 
2324       /* Apply reordering to t and stream into x.    */
2325       /* This way, x doesn't pollute the cache.      */
2326       /* Be careful with size: 2 doubles = 4 floats! */
2327       idc  = 4*(*c--);
2328       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2329         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2330         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2331         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2332         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2333         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2334         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2335       SSE_INLINE_END_2
2336       v    = aa + ai16 + 16;
2337       idt -= 4;
2338     }
2339 
2340     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2341     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2342     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2343     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2344     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2345   SSE_SCOPE_END;
2346   PetscFunctionReturn(0);
2347 }
2348 
2349 #endif
2350 
2351 
2352 /*
2353       Special case where the matrix was ILU(0) factored in the natural
2354    ordering. This eliminates the need for the column and row permutation.
2355 */
2356 #undef __FUNCT__
2357 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2358 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2359 {
2360   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2361   PetscInt          n=a->mbs;
2362   const PetscInt    *ai=a->i,*aj=a->j;
2363   PetscErrorCode    ierr;
2364   const PetscInt    *diag = a->diag;
2365   const MatScalar   *aa=a->a;
2366   PetscScalar       *x;
2367   const PetscScalar *b;
2368 
2369   PetscFunctionBegin;
2370   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2371   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2372 
2373 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2374   {
2375     static PetscScalar w[2000]; /* very BAD need to fix */
2376     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2377   }
2378 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2379   {
2380     static PetscScalar w[2000]; /* very BAD need to fix */
2381     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2382   }
2383 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2384   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2385 #else
2386   {
2387     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2388     const MatScalar *v;
2389     PetscInt        jdx,idt,idx,nz,i,ai16;
2390     const PetscInt  *vi;
2391 
2392   /* forward solve the lower triangular */
2393   idx    = 0;
2394   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2395   for (i=1; i<n; i++) {
2396     v     =  aa      + 16*ai[i];
2397     vi    =  aj      + ai[i];
2398     nz    =  diag[i] - ai[i];
2399     idx   +=  4;
2400     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2401     while (nz--) {
2402       jdx   = 4*(*vi++);
2403       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2404       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2405       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2406       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2407       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2408       v    += 16;
2409     }
2410     x[idx]   = s1;
2411     x[1+idx] = s2;
2412     x[2+idx] = s3;
2413     x[3+idx] = s4;
2414   }
2415   /* backward solve the upper triangular */
2416   idt = 4*(n-1);
2417   for (i=n-1; i>=0; i--){
2418     ai16 = 16*diag[i];
2419     v    = aa + ai16 + 16;
2420     vi   = aj + diag[i] + 1;
2421     nz   = ai[i+1] - diag[i] - 1;
2422     s1 = x[idt];  s2 = x[1+idt];
2423     s3 = x[2+idt];s4 = x[3+idt];
2424     while (nz--) {
2425       idx   = 4*(*vi++);
2426       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2427       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2428       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2429       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2430       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2431       v    += 16;
2432     }
2433     v        = aa + ai16;
2434     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2435     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2436     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2437     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2438     idt -= 4;
2439   }
2440   }
2441 #endif
2442 
2443   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2444   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2445   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2446   PetscFunctionReturn(0);
2447 }
2448 
2449 #undef __FUNCT__
2450 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2451 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2452 {
2453     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2455     PetscErrorCode    ierr;
2456     PetscInt          idx,jdx,idt;
2457     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2458     const MatScalar   *aa=a->a,*v;
2459     PetscScalar       *x;
2460     const PetscScalar *b;
2461     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2462 
2463     PetscFunctionBegin;
2464     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2465     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2466     /* forward solve the lower triangular */
2467     idx    = 0;
2468     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2469     for (i=1; i<n; i++) {
2470        v    = aa + bs2*ai[i];
2471        vi   = aj + ai[i];
2472        nz   = ai[i+1] - ai[i];
2473       idx   = bs*i;
2474        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2475        while (nz--) {
2476           jdx   = bs*(*vi++);
2477           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2478           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2479           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2480           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2481 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2482 
2483           v   +=  bs2;
2484         }
2485 
2486        x[idx]   = s1;
2487        x[1+idx] = s2;
2488        x[2+idx] = s3;
2489        x[3+idx] = s4;
2490     }
2491 
2492    /* backward solve the upper triangular */
2493   for (i=n-1; i>=0; i--){
2494      v   = aa + bs2*ai[2*n-i];
2495      vi  = aj + ai[2*n-i];
2496      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2497      idt = bs*i;
2498      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2499 
2500     while (nz--) {
2501       idx   = bs*(*vi++);
2502        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2503        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2504        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2505        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2506        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2507 
2508         v   +=  bs2;
2509     }
2510     /* x = inv_diagonal*x */
2511    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2512    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2513    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2514    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2515 
2516   }
2517 
2518   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2519   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2520   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2521   PetscFunctionReturn(0);
2522 }
2523 
2524 
2525 
2526 #undef __FUNCT__
2527 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2528 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2529 {
2530   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2531   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2532   PetscErrorCode ierr;
2533   PetscInt       *diag = a->diag;
2534   MatScalar      *aa=a->a;
2535   PetscScalar    *x,*b;
2536 
2537   PetscFunctionBegin;
2538   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2539   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2540 
2541   {
2542     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2543     MatScalar  *v,*t=(MatScalar *)x;
2544     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2545 
2546     /* forward solve the lower triangular */
2547     idx  = 0;
2548     t[0] = (MatScalar)b[0];
2549     t[1] = (MatScalar)b[1];
2550     t[2] = (MatScalar)b[2];
2551     t[3] = (MatScalar)b[3];
2552     for (i=1; i<n; i++) {
2553       v     =  aa      + 16*ai[i];
2554       vi    =  aj      + ai[i];
2555       nz    =  diag[i] - ai[i];
2556       idx   +=  4;
2557       s1 = (MatScalar)b[idx];
2558       s2 = (MatScalar)b[1+idx];
2559       s3 = (MatScalar)b[2+idx];
2560       s4 = (MatScalar)b[3+idx];
2561       while (nz--) {
2562         jdx = 4*(*vi++);
2563         x1  = t[jdx];
2564         x2  = t[1+jdx];
2565         x3  = t[2+jdx];
2566         x4  = t[3+jdx];
2567         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2568         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2569         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2570         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2571         v    += 16;
2572       }
2573       t[idx]   = s1;
2574       t[1+idx] = s2;
2575       t[2+idx] = s3;
2576       t[3+idx] = s4;
2577     }
2578     /* backward solve the upper triangular */
2579     idt = 4*(n-1);
2580     for (i=n-1; i>=0; i--){
2581       ai16 = 16*diag[i];
2582       v    = aa + ai16 + 16;
2583       vi   = aj + diag[i] + 1;
2584       nz   = ai[i+1] - diag[i] - 1;
2585       s1   = t[idt];
2586       s2   = t[1+idt];
2587       s3   = t[2+idt];
2588       s4   = t[3+idt];
2589       while (nz--) {
2590         idx = 4*(*vi++);
2591         x1  = (MatScalar)x[idx];
2592         x2  = (MatScalar)x[1+idx];
2593         x3  = (MatScalar)x[2+idx];
2594         x4  = (MatScalar)x[3+idx];
2595         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2596         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2597         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2598         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2599         v    += 16;
2600       }
2601       v        = aa + ai16;
2602       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2603       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2604       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2605       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2606       idt -= 4;
2607     }
2608   }
2609 
2610   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2611   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2612   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2613   PetscFunctionReturn(0);
2614 }
2615 
2616 #if defined (PETSC_HAVE_SSE)
2617 
2618 #include PETSC_HAVE_SSE
2619 #undef __FUNCT__
2620 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2621 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2622 {
2623   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2624   unsigned short *aj=(unsigned short *)a->j;
2625   PetscErrorCode ierr;
2626   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2627   MatScalar      *aa=a->a;
2628   PetscScalar    *x,*b;
2629 
2630   PetscFunctionBegin;
2631   SSE_SCOPE_BEGIN;
2632   /*
2633      Note: This code currently uses demotion of double
2634      to float when performing the mixed-mode computation.
2635      This may not be numerically reasonable for all applications.
2636   */
2637   PREFETCH_NTA(aa+16*ai[1]);
2638 
2639   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2640   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2641   {
2642     /* x will first be computed in single precision then promoted inplace to double */
2643     MatScalar      *v,*t=(MatScalar *)x;
2644     int            nz,i,idt,ai16;
2645     unsigned int   jdx,idx;
2646     unsigned short *vi;
2647     /* Forward solve the lower triangular factor. */
2648 
2649     /* First block is the identity. */
2650     idx  = 0;
2651     CONVERT_DOUBLE4_FLOAT4(t,b);
2652     v    =  aa + 16*((unsigned int)ai[1]);
2653 
2654     for (i=1; i<n;) {
2655       PREFETCH_NTA(&v[8]);
2656       vi   =  aj      + ai[i];
2657       nz   =  diag[i] - ai[i];
2658       idx +=  4;
2659 
2660       /* Demote RHS from double to float. */
2661       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2662       LOAD_PS(&t[idx],XMM7);
2663 
2664       while (nz--) {
2665         PREFETCH_NTA(&v[16]);
2666         jdx = 4*((unsigned int)(*vi++));
2667 
2668         /* 4x4 Matrix-Vector product with negative accumulation: */
2669         SSE_INLINE_BEGIN_2(&t[jdx],v)
2670           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2671 
2672           /* First Column */
2673           SSE_COPY_PS(XMM0,XMM6)
2674           SSE_SHUFFLE(XMM0,XMM0,0x00)
2675           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2676           SSE_SUB_PS(XMM7,XMM0)
2677 
2678           /* Second Column */
2679           SSE_COPY_PS(XMM1,XMM6)
2680           SSE_SHUFFLE(XMM1,XMM1,0x55)
2681           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2682           SSE_SUB_PS(XMM7,XMM1)
2683 
2684           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2685 
2686           /* Third Column */
2687           SSE_COPY_PS(XMM2,XMM6)
2688           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2689           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2690           SSE_SUB_PS(XMM7,XMM2)
2691 
2692           /* Fourth Column */
2693           SSE_COPY_PS(XMM3,XMM6)
2694           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2695           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2696           SSE_SUB_PS(XMM7,XMM3)
2697         SSE_INLINE_END_2
2698 
2699         v  += 16;
2700       }
2701       v    =  aa + 16*ai[++i];
2702       PREFETCH_NTA(v);
2703       STORE_PS(&t[idx],XMM7);
2704     }
2705 
2706     /* Backward solve the upper triangular factor.*/
2707 
2708     idt  = 4*(n-1);
2709     ai16 = 16*diag[n-1];
2710     v    = aa + ai16 + 16;
2711     for (i=n-1; i>=0;){
2712       PREFETCH_NTA(&v[8]);
2713       vi = aj + diag[i] + 1;
2714       nz = ai[i+1] - diag[i] - 1;
2715 
2716       LOAD_PS(&t[idt],XMM7);
2717 
2718       while (nz--) {
2719         PREFETCH_NTA(&v[16]);
2720         idx = 4*((unsigned int)(*vi++));
2721 
2722         /* 4x4 Matrix-Vector Product with negative accumulation: */
2723         SSE_INLINE_BEGIN_2(&t[idx],v)
2724           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2725 
2726           /* First Column */
2727           SSE_COPY_PS(XMM0,XMM6)
2728           SSE_SHUFFLE(XMM0,XMM0,0x00)
2729           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2730           SSE_SUB_PS(XMM7,XMM0)
2731 
2732           /* Second Column */
2733           SSE_COPY_PS(XMM1,XMM6)
2734           SSE_SHUFFLE(XMM1,XMM1,0x55)
2735           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2736           SSE_SUB_PS(XMM7,XMM1)
2737 
2738           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2739 
2740           /* Third Column */
2741           SSE_COPY_PS(XMM2,XMM6)
2742           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2743           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2744           SSE_SUB_PS(XMM7,XMM2)
2745 
2746           /* Fourth Column */
2747           SSE_COPY_PS(XMM3,XMM6)
2748           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2749           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2750           SSE_SUB_PS(XMM7,XMM3)
2751         SSE_INLINE_END_2
2752         v  += 16;
2753       }
2754       v    = aa + ai16;
2755       ai16 = 16*diag[--i];
2756       PREFETCH_NTA(aa+ai16+16);
2757       /*
2758          Scale the result by the diagonal 4x4 block,
2759          which was inverted as part of the factorization
2760       */
2761       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2762         /* First Column */
2763         SSE_COPY_PS(XMM0,XMM7)
2764         SSE_SHUFFLE(XMM0,XMM0,0x00)
2765         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2766 
2767         /* Second Column */
2768         SSE_COPY_PS(XMM1,XMM7)
2769         SSE_SHUFFLE(XMM1,XMM1,0x55)
2770         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2771         SSE_ADD_PS(XMM0,XMM1)
2772 
2773         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2774 
2775         /* Third Column */
2776         SSE_COPY_PS(XMM2,XMM7)
2777         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2778         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2779         SSE_ADD_PS(XMM0,XMM2)
2780 
2781         /* Fourth Column */
2782         SSE_COPY_PS(XMM3,XMM7)
2783         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2784         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2785         SSE_ADD_PS(XMM0,XMM3)
2786 
2787         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2788       SSE_INLINE_END_3
2789 
2790       v    = aa + ai16 + 16;
2791       idt -= 4;
2792     }
2793 
2794     /* Convert t from single precision back to double precision (inplace)*/
2795     idt = 4*(n-1);
2796     for (i=n-1;i>=0;i--) {
2797       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2798       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2799       PetscScalar *xtemp=&x[idt];
2800       MatScalar   *ttemp=&t[idt];
2801       xtemp[3] = (PetscScalar)ttemp[3];
2802       xtemp[2] = (PetscScalar)ttemp[2];
2803       xtemp[1] = (PetscScalar)ttemp[1];
2804       xtemp[0] = (PetscScalar)ttemp[0];
2805       idt -= 4;
2806     }
2807 
2808   } /* End of artificial scope. */
2809   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2810   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2811   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2812   SSE_SCOPE_END;
2813   PetscFunctionReturn(0);
2814 }
2815 
2816 #undef __FUNCT__
2817 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2818 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2819 {
2820   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2821   int            *aj=a->j;
2822   PetscErrorCode ierr;
2823   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2824   MatScalar      *aa=a->a;
2825   PetscScalar    *x,*b;
2826 
2827   PetscFunctionBegin;
2828   SSE_SCOPE_BEGIN;
2829   /*
2830      Note: This code currently uses demotion of double
2831      to float when performing the mixed-mode computation.
2832      This may not be numerically reasonable for all applications.
2833   */
2834   PREFETCH_NTA(aa+16*ai[1]);
2835 
2836   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2837   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2838   {
2839     /* x will first be computed in single precision then promoted inplace to double */
2840     MatScalar *v,*t=(MatScalar *)x;
2841     int       nz,i,idt,ai16;
2842     int       jdx,idx;
2843     int       *vi;
2844     /* Forward solve the lower triangular factor. */
2845 
2846     /* First block is the identity. */
2847     idx  = 0;
2848     CONVERT_DOUBLE4_FLOAT4(t,b);
2849     v    =  aa + 16*ai[1];
2850 
2851     for (i=1; i<n;) {
2852       PREFETCH_NTA(&v[8]);
2853       vi   =  aj      + ai[i];
2854       nz   =  diag[i] - ai[i];
2855       idx +=  4;
2856 
2857       /* Demote RHS from double to float. */
2858       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2859       LOAD_PS(&t[idx],XMM7);
2860 
2861       while (nz--) {
2862         PREFETCH_NTA(&v[16]);
2863         jdx = 4*(*vi++);
2864 /*          jdx = *vi++; */
2865 
2866         /* 4x4 Matrix-Vector product with negative accumulation: */
2867         SSE_INLINE_BEGIN_2(&t[jdx],v)
2868           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2869 
2870           /* First Column */
2871           SSE_COPY_PS(XMM0,XMM6)
2872           SSE_SHUFFLE(XMM0,XMM0,0x00)
2873           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2874           SSE_SUB_PS(XMM7,XMM0)
2875 
2876           /* Second Column */
2877           SSE_COPY_PS(XMM1,XMM6)
2878           SSE_SHUFFLE(XMM1,XMM1,0x55)
2879           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2880           SSE_SUB_PS(XMM7,XMM1)
2881 
2882           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2883 
2884           /* Third Column */
2885           SSE_COPY_PS(XMM2,XMM6)
2886           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2887           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2888           SSE_SUB_PS(XMM7,XMM2)
2889 
2890           /* Fourth Column */
2891           SSE_COPY_PS(XMM3,XMM6)
2892           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2893           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2894           SSE_SUB_PS(XMM7,XMM3)
2895         SSE_INLINE_END_2
2896 
2897         v  += 16;
2898       }
2899       v    =  aa + 16*ai[++i];
2900       PREFETCH_NTA(v);
2901       STORE_PS(&t[idx],XMM7);
2902     }
2903 
2904     /* Backward solve the upper triangular factor.*/
2905 
2906     idt  = 4*(n-1);
2907     ai16 = 16*diag[n-1];
2908     v    = aa + ai16 + 16;
2909     for (i=n-1; i>=0;){
2910       PREFETCH_NTA(&v[8]);
2911       vi = aj + diag[i] + 1;
2912       nz = ai[i+1] - diag[i] - 1;
2913 
2914       LOAD_PS(&t[idt],XMM7);
2915 
2916       while (nz--) {
2917         PREFETCH_NTA(&v[16]);
2918         idx = 4*(*vi++);
2919 /*          idx = *vi++; */
2920 
2921         /* 4x4 Matrix-Vector Product with negative accumulation: */
2922         SSE_INLINE_BEGIN_2(&t[idx],v)
2923           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2924 
2925           /* First Column */
2926           SSE_COPY_PS(XMM0,XMM6)
2927           SSE_SHUFFLE(XMM0,XMM0,0x00)
2928           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2929           SSE_SUB_PS(XMM7,XMM0)
2930 
2931           /* Second Column */
2932           SSE_COPY_PS(XMM1,XMM6)
2933           SSE_SHUFFLE(XMM1,XMM1,0x55)
2934           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2935           SSE_SUB_PS(XMM7,XMM1)
2936 
2937           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2938 
2939           /* Third Column */
2940           SSE_COPY_PS(XMM2,XMM6)
2941           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2942           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2943           SSE_SUB_PS(XMM7,XMM2)
2944 
2945           /* Fourth Column */
2946           SSE_COPY_PS(XMM3,XMM6)
2947           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2948           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2949           SSE_SUB_PS(XMM7,XMM3)
2950         SSE_INLINE_END_2
2951         v  += 16;
2952       }
2953       v    = aa + ai16;
2954       ai16 = 16*diag[--i];
2955       PREFETCH_NTA(aa+ai16+16);
2956       /*
2957          Scale the result by the diagonal 4x4 block,
2958          which was inverted as part of the factorization
2959       */
2960       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2961         /* First Column */
2962         SSE_COPY_PS(XMM0,XMM7)
2963         SSE_SHUFFLE(XMM0,XMM0,0x00)
2964         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2965 
2966         /* Second Column */
2967         SSE_COPY_PS(XMM1,XMM7)
2968         SSE_SHUFFLE(XMM1,XMM1,0x55)
2969         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2970         SSE_ADD_PS(XMM0,XMM1)
2971 
2972         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2973 
2974         /* Third Column */
2975         SSE_COPY_PS(XMM2,XMM7)
2976         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2977         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2978         SSE_ADD_PS(XMM0,XMM2)
2979 
2980         /* Fourth Column */
2981         SSE_COPY_PS(XMM3,XMM7)
2982         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2983         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2984         SSE_ADD_PS(XMM0,XMM3)
2985 
2986         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2987       SSE_INLINE_END_3
2988 
2989       v    = aa + ai16 + 16;
2990       idt -= 4;
2991     }
2992 
2993     /* Convert t from single precision back to double precision (inplace)*/
2994     idt = 4*(n-1);
2995     for (i=n-1;i>=0;i--) {
2996       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2997       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2998       PetscScalar *xtemp=&x[idt];
2999       MatScalar   *ttemp=&t[idt];
3000       xtemp[3] = (PetscScalar)ttemp[3];
3001       xtemp[2] = (PetscScalar)ttemp[2];
3002       xtemp[1] = (PetscScalar)ttemp[1];
3003       xtemp[0] = (PetscScalar)ttemp[0];
3004       idt -= 4;
3005     }
3006 
3007   } /* End of artificial scope. */
3008   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3009   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3010   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3011   SSE_SCOPE_END;
3012   PetscFunctionReturn(0);
3013 }
3014 
3015 #endif
3016 #undef __FUNCT__
3017 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3018 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
3019 {
3020   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3021   IS                iscol=a->col,isrow=a->row;
3022   PetscErrorCode    ierr;
3023   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3024   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3025   const MatScalar   *aa=a->a,*v;
3026   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3027   const PetscScalar *b;
3028 
3029   PetscFunctionBegin;
3030   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3031   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3032   t  = a->solve_work;
3033 
3034   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3035   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3036 
3037   /* forward solve the lower triangular */
3038   idx    = 3*(*r++);
3039   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3040   for (i=1; i<n; i++) {
3041     v     = aa + 9*ai[i];
3042     vi    = aj + ai[i];
3043     nz    = diag[i] - ai[i];
3044     idx   = 3*(*r++);
3045     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3046     while (nz--) {
3047       idx   = 3*(*vi++);
3048       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3049       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3050       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3051       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3052       v += 9;
3053     }
3054     idx = 3*i;
3055     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3056   }
3057   /* backward solve the upper triangular */
3058   for (i=n-1; i>=0; i--){
3059     v    = aa + 9*diag[i] + 9;
3060     vi   = aj + diag[i] + 1;
3061     nz   = ai[i+1] - diag[i] - 1;
3062     idt  = 3*i;
3063     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3064     while (nz--) {
3065       idx   = 3*(*vi++);
3066       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3067       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3068       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3069       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3070       v += 9;
3071     }
3072     idc = 3*(*c--);
3073     v   = aa + 9*diag[i];
3074     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3075     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3076     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3077   }
3078   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3079   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3080   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3081   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3082   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3083   PetscFunctionReturn(0);
3084 }
3085 
3086 /*
3087       Special case where the matrix was ILU(0) factored in the natural
3088    ordering. This eliminates the need for the column and row permutation.
3089 */
3090 #undef __FUNCT__
3091 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3092 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
3093 {
3094   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3095   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3096   PetscErrorCode    ierr;
3097   PetscInt          *diag = a->diag;
3098   const MatScalar   *aa=a->a,*v;
3099   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3100   const PetscScalar *b;
3101   PetscInt          jdx,idt,idx,nz,*vi,i;
3102 
3103   PetscFunctionBegin;
3104   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3105   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3106 
3107   /* forward solve the lower triangular */
3108   idx    = 0;
3109   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
3110   for (i=1; i<n; i++) {
3111     v     =  aa      + 9*ai[i];
3112     vi    =  aj      + ai[i];
3113     nz    =  diag[i] - ai[i];
3114     idx   +=  3;
3115     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
3116     while (nz--) {
3117       jdx   = 3*(*vi++);
3118       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3119       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3120       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3121       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3122       v    += 9;
3123     }
3124     x[idx]   = s1;
3125     x[1+idx] = s2;
3126     x[2+idx] = s3;
3127   }
3128   /* backward solve the upper triangular */
3129   for (i=n-1; i>=0; i--){
3130     v    = aa + 9*diag[i] + 9;
3131     vi   = aj + diag[i] + 1;
3132     nz   = ai[i+1] - diag[i] - 1;
3133     idt  = 3*i;
3134     s1 = x[idt];  s2 = x[1+idt];
3135     s3 = x[2+idt];
3136     while (nz--) {
3137       idx   = 3*(*vi++);
3138       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3139       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3140       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3141       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3142       v    += 9;
3143     }
3144     v        = aa +  9*diag[i];
3145     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3146     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3147     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3148   }
3149 
3150   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3151   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3152   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3153   PetscFunctionReturn(0);
3154 }
3155 
3156 #undef __FUNCT__
3157 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3158 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3159 {
3160     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3161     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3162     PetscErrorCode    ierr;
3163     PetscInt          idx,jdx,idt;
3164     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3165     const MatScalar   *aa=a->a,*v;
3166     PetscScalar       *x;
3167     const PetscScalar *b;
3168     PetscScalar        s1,s2,s3,x1,x2,x3;
3169 
3170     PetscFunctionBegin;
3171     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3172     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3173     /* forward solve the lower triangular */
3174     idx    = 0;
3175     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3176     for (i=1; i<n; i++) {
3177        v    = aa + bs2*ai[i];
3178        vi   = aj + ai[i];
3179        nz   = ai[i+1] - ai[i];
3180       idx   = bs*i;
3181        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3182        while (nz--) {
3183           jdx   = bs*(*vi++);
3184           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3185           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3186           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3187           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3188 
3189           v   +=  bs2;
3190         }
3191 
3192        x[idx]   = s1;
3193        x[1+idx] = s2;
3194        x[2+idx] = s3;
3195     }
3196 
3197    /* backward solve the upper triangular */
3198   for (i=n-1; i>=0; i--){
3199      v   = aa + bs2*ai[2*n-i];
3200      vi  = aj + ai[2*n-i];
3201      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3202      idt = bs*i;
3203      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3204 
3205     while (nz--) {
3206       idx   = bs*(*vi++);
3207        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3208        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3209        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3210        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3211 
3212         v   +=  bs2;
3213     }
3214     /* x = inv_diagonal*x */
3215    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3216    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3217    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3218 
3219   }
3220 
3221   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3222   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3223   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3224   PetscFunctionReturn(0);
3225 }
3226 
3227 #undef __FUNCT__
3228 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3229 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
3230 {
3231   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3232   IS                iscol=a->col,isrow=a->row;
3233   PetscErrorCode    ierr;
3234   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3235   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3236   const MatScalar   *aa=a->a,*v;
3237   PetscScalar       *x,s1,s2,x1,x2,*t;
3238   const PetscScalar *b;
3239 
3240   PetscFunctionBegin;
3241   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3242   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3243   t  = a->solve_work;
3244 
3245   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3246   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3247 
3248   /* forward solve the lower triangular */
3249   idx    = 2*(*r++);
3250   t[0] = b[idx]; t[1] = b[1+idx];
3251   for (i=1; i<n; i++) {
3252     v     = aa + 4*ai[i];
3253     vi    = aj + ai[i];
3254     nz    = diag[i] - ai[i];
3255     idx   = 2*(*r++);
3256     s1  = b[idx]; s2 = b[1+idx];
3257     while (nz--) {
3258       idx   = 2*(*vi++);
3259       x1    = t[idx]; x2 = t[1+idx];
3260       s1 -= v[0]*x1 + v[2]*x2;
3261       s2 -= v[1]*x1 + v[3]*x2;
3262       v += 4;
3263     }
3264     idx = 2*i;
3265     t[idx] = s1; t[1+idx] = s2;
3266   }
3267   /* backward solve the upper triangular */
3268   for (i=n-1; i>=0; i--){
3269     v    = aa + 4*diag[i] + 4;
3270     vi   = aj + diag[i] + 1;
3271     nz   = ai[i+1] - diag[i] - 1;
3272     idt  = 2*i;
3273     s1 = t[idt]; s2 = t[1+idt];
3274     while (nz--) {
3275       idx   = 2*(*vi++);
3276       x1    = t[idx]; x2 = t[1+idx];
3277       s1 -= v[0]*x1 + v[2]*x2;
3278       s2 -= v[1]*x1 + v[3]*x2;
3279       v += 4;
3280     }
3281     idc = 2*(*c--);
3282     v   = aa + 4*diag[i];
3283     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3284     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3285   }
3286   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3287   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3288   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3289   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3290   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3291   PetscFunctionReturn(0);
3292 }
3293 
3294 /*
3295       Special case where the matrix was ILU(0) factored in the natural
3296    ordering. This eliminates the need for the column and row permutation.
3297 */
3298 #undef __FUNCT__
3299 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3300 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
3301 {
3302   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3303   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3304   PetscErrorCode    ierr;
3305   PetscInt          *diag = a->diag;
3306   const MatScalar   *aa=a->a,*v;
3307   PetscScalar       *x,s1,s2,x1,x2;
3308   const PetscScalar *b;
3309   PetscInt          jdx,idt,idx,nz,*vi,i;
3310 
3311   PetscFunctionBegin;
3312   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3313   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3314 
3315   /* forward solve the lower triangular */
3316   idx    = 0;
3317   x[0]   = b[0]; x[1] = b[1];
3318   for (i=1; i<n; i++) {
3319     v     =  aa      + 4*ai[i];
3320     vi    =  aj      + ai[i];
3321     nz    =  diag[i] - ai[i];
3322     idx   +=  2;
3323     s1  =  b[idx];s2 = b[1+idx];
3324     while (nz--) {
3325       jdx   = 2*(*vi++);
3326       x1    = x[jdx];x2 = x[1+jdx];
3327       s1 -= v[0]*x1 + v[2]*x2;
3328       s2 -= v[1]*x1 + v[3]*x2;
3329       v    += 4;
3330     }
3331     x[idx]   = s1;
3332     x[1+idx] = s2;
3333   }
3334   /* backward solve the upper triangular */
3335   for (i=n-1; i>=0; i--){
3336     v    = aa + 4*diag[i] + 4;
3337     vi   = aj + diag[i] + 1;
3338     nz   = ai[i+1] - diag[i] - 1;
3339     idt  = 2*i;
3340     s1 = x[idt];  s2 = x[1+idt];
3341     while (nz--) {
3342       idx   = 2*(*vi++);
3343       x1    = x[idx];   x2 = x[1+idx];
3344       s1 -= v[0]*x1 + v[2]*x2;
3345       s2 -= v[1]*x1 + v[3]*x2;
3346       v    += 4;
3347     }
3348     v        = aa +  4*diag[i];
3349     x[idt]   = v[0]*s1 + v[2]*s2;
3350     x[1+idt] = v[1]*s1 + v[3]*s2;
3351   }
3352 
3353   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3354   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3355   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3356   PetscFunctionReturn(0);
3357 }
3358 
3359 #undef __FUNCT__
3360 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3361 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3362 {
3363     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3364     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3365     PetscErrorCode    ierr;
3366     PetscInt          jdx;
3367     const MatScalar   *aa=a->a,*v;
3368     PetscScalar       *x,s1,s2,x1,x2;
3369     const PetscScalar *b;
3370 
3371     PetscFunctionBegin;
3372     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3373     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3374     /* forward solve the lower triangular */
3375     idx    = 0;
3376     x[0] = b[idx]; x[1] = b[1+idx];
3377     for (i=1; i<n; i++) {
3378         v   = aa + 4*ai[i];
3379        vi   = aj + ai[i];
3380        nz   = ai[i+1] - ai[i];
3381        idx  = 2*i;
3382        s1   = b[idx];s2 = b[1+idx];
3383        while (nz--) {
3384           jdx   = 2*(*vi++);
3385           x1    = x[jdx];x2 = x[1+jdx];
3386           s1   -= v[0]*x1 + v[2]*x2;
3387           s2   -= v[1]*x1 + v[3]*x2;
3388            v   +=  4;
3389         }
3390        x[idx]   = s1;
3391        x[1+idx] = s2;
3392     }
3393 
3394    /* backward solve the upper triangular */
3395   for (i=n-1; i>=0; i--){
3396      v   = aa + 4*ai[2*n-i];
3397      vi  = aj + ai[2*n-i];
3398      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3399      idt = 2*i;
3400      s1 = x[idt];  s2 = x[1+idt];
3401      while (nz--) {
3402       idx   = 2*(*vi++);
3403        x1    = x[idx];   x2 = x[1+idx];
3404        s1 -= v[0]*x1 + v[2]*x2;
3405        s2 -= v[1]*x1 + v[3]*x2;
3406          v    += 4;
3407     }
3408     /* x = inv_diagonal*x */
3409    x[idt]   = v[0]*s1 + v[2]*s2;
3410    x[1+idt] = v[1]*s1 + v[3]*s2;
3411   }
3412 
3413   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3414   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3415   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3416   PetscFunctionReturn(0);
3417 }
3418 
3419 #undef __FUNCT__
3420 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3421 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
3422 {
3423   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3424   IS             iscol=a->col,isrow=a->row;
3425   PetscErrorCode ierr;
3426   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3427   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3428   MatScalar      *aa=a->a,*v;
3429   PetscScalar    *x,*b,s1,*t;
3430 
3431   PetscFunctionBegin;
3432   if (!n) PetscFunctionReturn(0);
3433 
3434   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3435   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3436   t  = a->solve_work;
3437 
3438   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3439   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3440 
3441   /* forward solve the lower triangular */
3442   t[0] = b[*r++];
3443   for (i=1; i<n; i++) {
3444     v     = aa + ai[i];
3445     vi    = aj + ai[i];
3446     nz    = diag[i] - ai[i];
3447     s1  = b[*r++];
3448     while (nz--) {
3449       s1 -= (*v++)*t[*vi++];
3450     }
3451     t[i] = s1;
3452   }
3453   /* backward solve the upper triangular */
3454   for (i=n-1; i>=0; i--){
3455     v    = aa + diag[i] + 1;
3456     vi   = aj + diag[i] + 1;
3457     nz   = ai[i+1] - diag[i] - 1;
3458     s1 = t[i];
3459     while (nz--) {
3460       s1 -= (*v++)*t[*vi++];
3461     }
3462     x[*c--] = t[i] = aa[diag[i]]*s1;
3463   }
3464 
3465   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3466   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3467   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3468   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3469   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3470   PetscFunctionReturn(0);
3471 }
3472 /*
3473       Special case where the matrix was ILU(0) factored in the natural
3474    ordering. This eliminates the need for the column and row permutation.
3475 */
3476 #undef __FUNCT__
3477 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3478 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3479 {
3480   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3481   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3482   PetscErrorCode ierr;
3483   PetscInt       *diag = a->diag;
3484   MatScalar      *aa=a->a;
3485   PetscScalar    *x,*b;
3486   PetscScalar    s1,x1;
3487   MatScalar      *v;
3488   PetscInt       jdx,idt,idx,nz,*vi,i;
3489 
3490   PetscFunctionBegin;
3491   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3492   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3493 
3494   /* forward solve the lower triangular */
3495   idx    = 0;
3496   x[0]   = b[0];
3497   for (i=1; i<n; i++) {
3498     v     =  aa      + ai[i];
3499     vi    =  aj      + ai[i];
3500     nz    =  diag[i] - ai[i];
3501     idx   +=  1;
3502     s1  =  b[idx];
3503     while (nz--) {
3504       jdx   = *vi++;
3505       x1    = x[jdx];
3506       s1 -= v[0]*x1;
3507       v    += 1;
3508     }
3509     x[idx]   = s1;
3510   }
3511   /* backward solve the upper triangular */
3512   for (i=n-1; i>=0; i--){
3513     v    = aa + diag[i] + 1;
3514     vi   = aj + diag[i] + 1;
3515     nz   = ai[i+1] - diag[i] - 1;
3516     idt  = i;
3517     s1 = x[idt];
3518     while (nz--) {
3519       idx   = *vi++;
3520       x1    = x[idx];
3521       s1 -= v[0]*x1;
3522       v    += 1;
3523     }
3524     v        = aa +  diag[i];
3525     x[idt]   = v[0]*s1;
3526   }
3527   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3528   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3529   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3530   PetscFunctionReturn(0);
3531 }
3532 
3533 /* ----------------------------------------------------------------*/
3534 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3535 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3536 
3537 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
3538 #undef __FUNCT__
3539 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
3540 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
3541 {
3542   Mat            C=B;
3543   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
3544   IS             isrow = b->row,isicol = b->icol;
3545   PetscErrorCode ierr;
3546   const PetscInt *r,*ic,*ics;
3547   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
3548   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3549   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3550   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3551   MatScalar      *v_work;
3552 
3553   PetscFunctionBegin;
3554   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3555   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3556   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3557   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
3558   ics  = ic;
3559 
3560   /* generate work space needed by dense LU factorization */
3561   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3562   multiplier = v_work + bs;
3563   v_pivots   = (PetscInt*)(multiplier + bs2);
3564 
3565   for (i=0; i<n; i++){
3566     /* zero rtmp */
3567     /* L part */
3568     nz    = bi[i+1] - bi[i];
3569     bjtmp = bj + bi[i];
3570     for  (j=0; j<nz; j++){
3571       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3572     }
3573 
3574     /* U part */
3575     nz = bi[2*n-i+1] - bi[2*n-i];
3576     bjtmp = bj + bi[2*n-i];
3577     for  (j=0; j<nz; j++){
3578       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3579     }
3580 
3581     /* load in initial (unfactored row) */
3582     nz    = ai[r[i]+1] - ai[r[i]];
3583     ajtmp = aj + ai[r[i]];
3584     v     = aa + bs2*ai[r[i]];
3585     for (j=0; j<nz; j++) {
3586       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
3587     }
3588 
3589     /* elimination */
3590     bjtmp = bj + bi[i];
3591     row   = *bjtmp++;
3592     nzL   = bi[i+1] - bi[i];
3593     k   = 0;
3594     while  (k < nzL) {
3595       pc = rtmp + bs2*row;
3596       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3597       if (flg) {
3598         pv         = b->a + bs2*bdiag[row];
3599         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
3600         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3601         pv         = b->a + bs2*bi[2*n-row];
3602         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3603         for (j=0; j<nz; j++) {
3604           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3605         }
3606         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
3607       }
3608       row = *bjtmp++; k++;
3609     }
3610 
3611     /* finished row so stick it into b->a */
3612     /* L part */
3613     pv   = b->a + bs2*bi[i] ;
3614     pj   = b->j + bi[i] ;
3615     nz   = bi[i+1] - bi[i];
3616     for (j=0; j<nz; j++) {
3617       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3618     }
3619 
3620     /* Mark diagonal and invert diagonal for simplier triangular solves */
3621     pv  = b->a + bs2*bdiag[i];
3622     pj  = b->j + bdiag[i];
3623     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3624     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3625     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
3626 
3627     /* U part */
3628     pv = b->a + bs2*bi[2*n-i];
3629     pj = b->j + bi[2*n-i];
3630     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3631     for (j=0; j<nz; j++){
3632       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3633     }
3634   }
3635 
3636   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3637   ierr = PetscFree(v_work);CHKERRQ(ierr);
3638   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3639   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3640 
3641   switch (A->rmap->bs){
3642   case 2:
3643     C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3644     break;
3645   case 3:
3646     C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3647     break;
3648   case 4:
3649     C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3650     break;
3651   case 5:
3652     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3653     break;
3654   case 6:
3655     C->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3656     break;
3657   case 7:
3658     C->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3659     break;
3660   default:
3661     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3662     break;
3663   }
3664   C->assembled = PETSC_TRUE;
3665   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
3666   PetscFunctionReturn(0);
3667 }
3668 
3669 /*
3670    ilu(0) with natural ordering under new data structure.
3671    Factored arrays bj and ba are stored as
3672      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
3673 
3674    bi=fact->i is an array of size 2n+2, in which
3675    bi+
3676      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
3677      bi[n]      ->  end of L(n-1,:)+1
3678      bi[n+1]    ->  1st entry of U(n-1,:)
3679      bi[2n-i]   ->  1st entry of U(i,:)
3680      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
3681      bi[2n]     ->  end of U(0,:)+1
3682 
3683    U(i,:) contains diag[i] as its last entry, i.e.,
3684     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
3685 */
3686 #undef __FUNCT__
3687 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
3688 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3689 {
3690 
3691   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3692   PetscErrorCode     ierr;
3693   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
3694   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
3695 
3696   PetscFunctionBegin;
3697   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3698   b     = (Mat_SeqBAIJ*)(fact)->data;
3699   bdiag = b->diag;
3700 
3701   /* replace matrix arrays with single allocations, then reset values */
3702   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
3703 
3704   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
3705   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
3706   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
3707   b->singlemalloc = PETSC_FALSE;
3708   if (mbs > 0) {
3709     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
3710   }
3711 
3712   /* set bi and bj with new data structure */
3713   bi = b->i;
3714   bj = b->j;
3715 
3716   /* L part */
3717   bi[0] = 0;
3718   for (i=0; i<mbs; i++){
3719     nz = adiag[i] - ai[i];
3720     bi[i+1] = bi[i] + nz;
3721     aj = a->j + ai[i];
3722     for (j=0; j<nz; j++){
3723       *bj = aj[j]; bj++;
3724     }
3725   }
3726 
3727   /* U part */
3728   bi[mbs+1] = bi[mbs];
3729   for (i=mbs-1; i>=0; i--){
3730     nz = ai[i+1] - adiag[i] - 1;
3731     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3732     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
3733     aj = a->j + adiag[i] + 1;
3734     for (j=0; j<nz; j++){
3735       *bj = aj[j]; bj++;
3736     }
3737     /* diag[i] */
3738     *bj = i; bj++;
3739     bdiag[i] = bi[2*mbs-i+1]-1;
3740   }
3741   PetscFunctionReturn(0);
3742 }
3743 
3744 /*
3745      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3746    except that the data structure of Mat_SeqAIJ is slightly different.
3747    Not a good example of code reuse.
3748 */
3749 
3750 #undef __FUNCT__
3751 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3752 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3753 {
3754   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3755   IS             isicol;
3756   PetscErrorCode ierr;
3757   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3758   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3759   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3760   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3761   PetscTruth     col_identity,row_identity,both_identity,flg;
3762   PetscReal      f;
3763 
3764   PetscFunctionBegin;
3765   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
3766   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
3767 
3768   f             = info->fill;
3769   levels        = (PetscInt)info->levels;
3770   diagonal_fill = (PetscInt)info->diagonal_fill;
3771   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3772   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3773   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3774   both_identity = (PetscTruth) (row_identity && col_identity);
3775 
3776   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3777 
3778     PetscTruth newdatastruct=PETSC_FALSE;
3779     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
3780     if (newdatastruct){
3781       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
3782       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3783     } else {
3784       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3785       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3786     }
3787 
3788     fact->factor = MAT_FACTOR_ILU;
3789     b            = (Mat_SeqBAIJ*)(fact)->data;
3790     b->row       = isrow;
3791     b->col       = iscol;
3792     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3793     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3794     b->icol      = isicol;
3795     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3796     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3797     PetscFunctionReturn(0);
3798   }
3799 
3800   /* general case perform the symbolic factorization */
3801     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3802     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3803 
3804     /* get new row pointers */
3805     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3806     ainew[0] = 0;
3807     /* don't know how many column pointers are needed so estimate */
3808     jmax = (PetscInt)(f*ai[n] + 1);
3809     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3810     /* ajfill is level of fill for each fill entry */
3811     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3812     /* fill is a linked list of nonzeros in active row */
3813     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3814     /* im is level for each filled value */
3815     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3816     /* dloc is location of diagonal in factor */
3817     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3818     dloc[0]  = 0;
3819     for (prow=0; prow<n; prow++) {
3820 
3821       /* copy prow into linked list */
3822       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3823       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
3824       xi         = aj + ai[r[prow]];
3825       fill[n]    = n;
3826       fill[prow] = -1; /* marker for diagonal entry */
3827       while (nz--) {
3828 	fm  = n;
3829 	idx = ic[*xi++];
3830 	do {
3831 	  m  = fm;
3832 	  fm = fill[m];
3833 	} while (fm < idx);
3834 	fill[m]   = idx;
3835 	fill[idx] = fm;
3836 	im[idx]   = 0;
3837       }
3838 
3839       /* make sure diagonal entry is included */
3840       if (diagonal_fill && fill[prow] == -1) {
3841 	fm = n;
3842 	while (fill[fm] < prow) fm = fill[fm];
3843 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3844 	fill[fm]   = prow;
3845 	im[prow]   = 0;
3846 	nzf++;
3847 	dcount++;
3848       }
3849 
3850       nzi = 0;
3851       row = fill[n];
3852       while (row < prow) {
3853 	incrlev = im[row] + 1;
3854 	nz      = dloc[row];
3855 	xi      = ajnew  + ainew[row] + nz + 1;
3856 	flev    = ajfill + ainew[row] + nz + 1;
3857 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3858 	fm      = row;
3859 	while (nnz-- > 0) {
3860 	  idx = *xi++;
3861 	  if (*flev + incrlev > levels) {
3862 	    flev++;
3863 	    continue;
3864 	  }
3865 	  do {
3866 	    m  = fm;
3867 	    fm = fill[m];
3868 	  } while (fm < idx);
3869 	  if (fm != idx) {
3870 	    im[idx]   = *flev + incrlev;
3871 	    fill[m]   = idx;
3872 	    fill[idx] = fm;
3873 	    fm        = idx;
3874 	    nzf++;
3875 	  } else {
3876 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3877 	  }
3878 	  flev++;
3879 	}
3880 	row = fill[row];
3881 	nzi++;
3882       }
3883       /* copy new filled row into permanent storage */
3884       ainew[prow+1] = ainew[prow] + nzf;
3885       if (ainew[prow+1] > jmax) {
3886 
3887 	/* estimate how much additional space we will need */
3888 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3889 	/* just double the memory each time */
3890 	PetscInt maxadd = jmax;
3891 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3892 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3893 	jmax += maxadd;
3894 
3895 	/* allocate a longer ajnew and ajfill */
3896 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3897 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3898 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3899 	ajnew = xitmp;
3900 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3901 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3902 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3903 	ajfill = xitmp;
3904 	reallocate++; /* count how many reallocations are needed */
3905       }
3906       xitmp       = ajnew + ainew[prow];
3907       flev        = ajfill + ainew[prow];
3908       dloc[prow]  = nzi;
3909       fm          = fill[n];
3910       while (nzf--) {
3911 	*xitmp++ = fm;
3912 	*flev++ = im[fm];
3913 	fm      = fill[fm];
3914       }
3915       /* make sure row has diagonal entry */
3916       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3917 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3918     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3919       }
3920     }
3921     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3922     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3923     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3924     ierr = PetscFree(fill);CHKERRQ(ierr);
3925     ierr = PetscFree(im);CHKERRQ(ierr);
3926 
3927 #if defined(PETSC_USE_INFO)
3928     {
3929       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3930       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3931       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3932       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3933       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3934       if (diagonal_fill) {
3935 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3936       }
3937     }
3938 #endif
3939 
3940     /* put together the new matrix */
3941     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3942     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3943     b    = (Mat_SeqBAIJ*)(fact)->data;
3944     b->free_a       = PETSC_TRUE;
3945     b->free_ij      = PETSC_TRUE;
3946     b->singlemalloc = PETSC_FALSE;
3947     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3948     b->j          = ajnew;
3949     b->i          = ainew;
3950     for (i=0; i<n; i++) dloc[i] += ainew[i];
3951     b->diag       = dloc;
3952     b->ilen       = 0;
3953     b->imax       = 0;
3954     b->row        = isrow;
3955     b->col        = iscol;
3956     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3957     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3958     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3959     b->icol       = isicol;
3960     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3961     /* In b structure:  Free imax, ilen, old a, old j.
3962        Allocate dloc, solve_work, new a, new j */
3963     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3964     b->maxnz          = b->nz = ainew[n];
3965 
3966     (fact)->info.factor_mallocs    = reallocate;
3967     (fact)->info.fill_ratio_given  = f;
3968     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3969 
3970   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3971   PetscFunctionReturn(0);
3972 }
3973 
3974 #undef __FUNCT__
3975 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3976 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3977 {
3978   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3979   /* int i,*AJ=a->j,nz=a->nz; */
3980   PetscFunctionBegin;
3981   /* Undo Column scaling */
3982 /*    while (nz--) { */
3983 /*      AJ[i] = AJ[i]/4; */
3984 /*    } */
3985   /* This should really invoke a push/pop logic, but we don't have that yet. */
3986   A->ops->setunfactored = PETSC_NULL;
3987   PetscFunctionReturn(0);
3988 }
3989 
3990 #undef __FUNCT__
3991 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3992 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3993 {
3994   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3995   PetscInt       *AJ=a->j,nz=a->nz;
3996   unsigned short *aj=(unsigned short *)AJ;
3997   PetscFunctionBegin;
3998   /* Is this really necessary? */
3999   while (nz--) {
4000     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
4001   }
4002   A->ops->setunfactored = PETSC_NULL;
4003   PetscFunctionReturn(0);
4004 }
4005 
4006 
4007