xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 5aef685b1babf92f4c12eaa16cfdd1dc6e433c19)
1 #define PETSCMAT_DLL
2 
3 
4 /*
5     Factorization code for BAIJ format.
6 */
7 
8 #include "../src/mat/impls/baij/seq/baij.h"
9 #include "../src/mat/blockinvert.h"
10 
11 #undef __FUNCT__
12 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13 PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14 {
15   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16   PetscErrorCode ierr;
17   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18   PetscInt       *diag = a->diag;
19   MatScalar      *aa=a->a,*v;
20   PetscScalar    s1,*x,*b;
21 
22   PetscFunctionBegin;
23   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
24   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26 
27   /* forward solve the U^T */
28   for (i=0; i<n; i++) {
29 
30     v     = aa + diag[i];
31     /* multiply by the inverse of the block diagonal */
32     s1    = (*v++)*x[i];
33     vi    = aj + diag[i] + 1;
34     nz    = ai[i+1] - diag[i] - 1;
35     while (nz--) {
36       x[*vi++]  -= (*v++)*s1;
37     }
38     x[i]   = s1;
39   }
40   /* backward solve the L^T */
41   for (i=n-1; i>=0; i--){
42     v    = aa + diag[i] - 1;
43     vi   = aj + diag[i] - 1;
44     nz   = diag[i] - ai[i];
45     s1   = x[i];
46     while (nz--) {
47       x[*vi--]   -=  (*v--)*s1;
48     }
49   }
50   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
51   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53   PetscFunctionReturn(0);
54 }
55 
56 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
57 {
58     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
59     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60     PetscErrorCode    ierr;
61     PetscInt          jdx;
62     const MatScalar   *aa=a->a,*v;
63     PetscScalar       *x,s1,s2,x1,x2;
64     const PetscScalar *b;
65 
66     PetscFunctionBegin;
67     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
68     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
69     /* forward solve the lower triangular */
70     idx    = 0;
71     x[0] = b[idx]; x[1] = b[1+idx];
72     for (i=1; i<n; i++) {
73         v   = aa + 4*ai[i];
74        vi   = aj + ai[i];
75        nz   = ai[i+1] - ai[i];
76        idx  = 2*i;
77        s1   = b[idx];s2 = b[1+idx];
78        while (nz--) {
79           jdx   = 2*(*vi++);
80           x1    = x[jdx];x2 = x[1+jdx];
81           s1   -= v[0]*x1 + v[2]*x2;
82           s2   -= v[1]*x1 + v[3]*x2;
83            v   +=  4;
84         }
85        x[idx]   = s1;
86        x[1+idx] = s2;
87     }
88 
89    /* backward solve the upper triangular */
90   for (i=n-1; i>=0; i--){
91      v   = aa + 4*ai[2*n-i];
92      vi  = aj + ai[2*n-i];
93      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
94      idt = 2*i;
95      s1 = x[idt];  s2 = x[1+idt];
96      while (nz--) {
97       idx   = 2*(*vi++);
98        x1    = x[idx];   x2 = x[1+idx];
99        s1 -= v[0]*x1 + v[2]*x2;
100        s2 -= v[1]*x1 + v[3]*x2;
101          v    += 4;
102     }
103     /* x = inv_diagonal*x */
104    x[idt]   = v[0]*s1 + v[2]*s2;
105    x[1+idt] = v[1]*s1 + v[3]*s2;
106   }
107 
108   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
109   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
110   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
111   PetscFunctionReturn(0);
112 }
113 
114 #undef __FUNCT__
115 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
116 PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
117 {
118   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119   PetscErrorCode ierr;
120   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
121   PetscInt       *diag = a->diag,oidx;
122   MatScalar      *aa=a->a,*v;
123   PetscScalar    s1,s2,x1,x2;
124   PetscScalar    *x,*b;
125 
126   PetscFunctionBegin;
127   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
128   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
129   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
130 
131   /* forward solve the U^T */
132   idx = 0;
133   for (i=0; i<n; i++) {
134 
135     v     = aa + 4*diag[i];
136     /* multiply by the inverse of the block diagonal */
137     x1 = x[idx];   x2 = x[1+idx];
138     s1 = v[0]*x1  +  v[1]*x2;
139     s2 = v[2]*x1  +  v[3]*x2;
140     v += 4;
141 
142     vi    = aj + diag[i] + 1;
143     nz    = ai[i+1] - diag[i] - 1;
144     while (nz--) {
145       oidx = 2*(*vi++);
146       x[oidx]   -= v[0]*s1  +  v[1]*s2;
147       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
148       v  += 4;
149     }
150     x[idx]   = s1;x[1+idx] = s2;
151     idx += 2;
152   }
153   /* backward solve the L^T */
154   for (i=n-1; i>=0; i--){
155     v    = aa + 4*diag[i] - 4;
156     vi   = aj + diag[i] - 1;
157     nz   = diag[i] - ai[i];
158     idt  = 2*i;
159     s1   = x[idt];  s2 = x[1+idt];
160     while (nz--) {
161       idx   = 2*(*vi--);
162       x[idx]   -=  v[0]*s1 +  v[1]*s2;
163       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
164       v -= 4;
165     }
166   }
167   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
168   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
169   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
170   PetscFunctionReturn(0);
171 }
172 
173 #undef __FUNCT__
174 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
175 PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
176 {
177   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
178   PetscErrorCode ierr;
179   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180   PetscInt       *diag = a->diag,oidx;
181   MatScalar      *aa=a->a,*v;
182   PetscScalar    s1,s2,s3,x1,x2,x3;
183   PetscScalar    *x,*b;
184 
185   PetscFunctionBegin;
186   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189 
190   /* forward solve the U^T */
191   idx = 0;
192   for (i=0; i<n; i++) {
193 
194     v     = aa + 9*diag[i];
195     /* multiply by the inverse of the block diagonal */
196     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
197     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
198     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
199     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
200     v += 9;
201 
202     vi    = aj + diag[i] + 1;
203     nz    = ai[i+1] - diag[i] - 1;
204     while (nz--) {
205       oidx = 3*(*vi++);
206       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
207       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
208       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
209       v  += 9;
210     }
211     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
212     idx += 3;
213   }
214   /* backward solve the L^T */
215   for (i=n-1; i>=0; i--){
216     v    = aa + 9*diag[i] - 9;
217     vi   = aj + diag[i] - 1;
218     nz   = diag[i] - ai[i];
219     idt  = 3*i;
220     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
221     while (nz--) {
222       idx   = 3*(*vi--);
223       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
224       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
225       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
226       v -= 9;
227     }
228   }
229   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
230   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
231   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
232   PetscFunctionReturn(0);
233 }
234 
235 #undef __FUNCT__
236 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
237 PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
238 {
239   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
240   PetscErrorCode ierr;
241   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
242   PetscInt       *diag = a->diag,oidx;
243   MatScalar      *aa=a->a,*v;
244   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
245   PetscScalar    *x,*b;
246 
247   PetscFunctionBegin;
248   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
249   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
250   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251 
252   /* forward solve the U^T */
253   idx = 0;
254   for (i=0; i<n; i++) {
255 
256     v     = aa + 16*diag[i];
257     /* multiply by the inverse of the block diagonal */
258     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
259     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
260     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
261     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
262     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
263     v += 16;
264 
265     vi    = aj + diag[i] + 1;
266     nz    = ai[i+1] - diag[i] - 1;
267     while (nz--) {
268       oidx = 4*(*vi++);
269       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
270       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
271       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
272       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
273       v  += 16;
274     }
275     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
276     idx += 4;
277   }
278   /* backward solve the L^T */
279   for (i=n-1; i>=0; i--){
280     v    = aa + 16*diag[i] - 16;
281     vi   = aj + diag[i] - 1;
282     nz   = diag[i] - ai[i];
283     idt  = 4*i;
284     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
285     while (nz--) {
286       idx   = 4*(*vi--);
287       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
288       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
289       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
290       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
291       v -= 16;
292     }
293   }
294   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
297   PetscFunctionReturn(0);
298 }
299 
300 #undef __FUNCT__
301 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
302 PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
303 {
304   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305   PetscErrorCode ierr;
306   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307   PetscInt       *diag = a->diag,oidx;
308   MatScalar      *aa=a->a,*v;
309   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
310   PetscScalar    *x,*b;
311 
312   PetscFunctionBegin;
313   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
315   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316 
317   /* forward solve the U^T */
318   idx = 0;
319   for (i=0; i<n; i++) {
320 
321     v     = aa + 25*diag[i];
322     /* multiply by the inverse of the block diagonal */
323     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
324     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
325     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
326     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
327     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
328     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
329     v += 25;
330 
331     vi    = aj + diag[i] + 1;
332     nz    = ai[i+1] - diag[i] - 1;
333     while (nz--) {
334       oidx = 5*(*vi++);
335       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
336       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
337       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
338       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
339       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
340       v  += 25;
341     }
342     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
343     idx += 5;
344   }
345   /* backward solve the L^T */
346   for (i=n-1; i>=0; i--){
347     v    = aa + 25*diag[i] - 25;
348     vi   = aj + diag[i] - 1;
349     nz   = diag[i] - ai[i];
350     idt  = 5*i;
351     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
352     while (nz--) {
353       idx   = 5*(*vi--);
354       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
355       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
356       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
357       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
358       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
359       v -= 25;
360     }
361   }
362   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
363   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
364   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
365   PetscFunctionReturn(0);
366 }
367 
368 #undef __FUNCT__
369 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
370 PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
371 {
372   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
373   PetscErrorCode ierr;
374   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
375   PetscInt       *diag = a->diag,oidx;
376   MatScalar      *aa=a->a,*v;
377   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
378   PetscScalar    *x,*b;
379 
380   PetscFunctionBegin;
381   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
382   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
383   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
384 
385   /* forward solve the U^T */
386   idx = 0;
387   for (i=0; i<n; i++) {
388 
389     v     = aa + 36*diag[i];
390     /* multiply by the inverse of the block diagonal */
391     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
392     x6    = x[5+idx];
393     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
394     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
395     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
396     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
397     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
398     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
399     v += 36;
400 
401     vi    = aj + diag[i] + 1;
402     nz    = ai[i+1] - diag[i] - 1;
403     while (nz--) {
404       oidx = 6*(*vi++);
405       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
406       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
407       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
408       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
409       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
410       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
411       v  += 36;
412     }
413     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
414     x[5+idx] = s6;
415     idx += 6;
416   }
417   /* backward solve the L^T */
418   for (i=n-1; i>=0; i--){
419     v    = aa + 36*diag[i] - 36;
420     vi   = aj + diag[i] - 1;
421     nz   = diag[i] - ai[i];
422     idt  = 6*i;
423     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
424     s6 = x[5+idt];
425     while (nz--) {
426       idx   = 6*(*vi--);
427       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
428       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
429       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
430       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
431       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
432       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
433       v -= 36;
434     }
435   }
436   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
437   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
438   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
439   PetscFunctionReturn(0);
440 }
441 
442 #undef __FUNCT__
443 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
444 PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
445 {
446   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
447   PetscErrorCode ierr;
448   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
449   PetscInt       *diag = a->diag,oidx;
450   MatScalar      *aa=a->a,*v;
451   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
452   PetscScalar    *x,*b;
453 
454   PetscFunctionBegin;
455   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
456   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
457   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
458 
459   /* forward solve the U^T */
460   idx = 0;
461   for (i=0; i<n; i++) {
462 
463     v     = aa + 49*diag[i];
464     /* multiply by the inverse of the block diagonal */
465     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
466     x6    = x[5+idx]; x7 = x[6+idx];
467     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
468     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
469     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
470     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
471     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
472     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
473     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
474     v += 49;
475 
476     vi    = aj + diag[i] + 1;
477     nz    = ai[i+1] - diag[i] - 1;
478     while (nz--) {
479       oidx = 7*(*vi++);
480       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
481       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
482       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
483       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
484       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
485       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
486       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
487       v  += 49;
488     }
489     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
490     x[5+idx] = s6;x[6+idx] = s7;
491     idx += 7;
492   }
493   /* backward solve the L^T */
494   for (i=n-1; i>=0; i--){
495     v    = aa + 49*diag[i] - 49;
496     vi   = aj + diag[i] - 1;
497     nz   = diag[i] - ai[i];
498     idt  = 7*i;
499     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
500     s6 = x[5+idt];s7 = x[6+idt];
501     while (nz--) {
502       idx   = 7*(*vi--);
503       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
504       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
505       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
506       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
507       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
508       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
509       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
510       v -= 49;
511     }
512   }
513   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
514   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
515   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
516   PetscFunctionReturn(0);
517 }
518 
519 /*---------------------------------------------------------------------------------------------*/
520 #undef __FUNCT__
521 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
522 PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
523 {
524   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
525   IS             iscol=a->col,isrow=a->row;
526   PetscErrorCode ierr;
527   const PetscInt *r,*c,*rout,*cout;
528   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
529   PetscInt       *diag = a->diag;
530   MatScalar      *aa=a->a,*v;
531   PetscScalar    s1,*x,*b,*t;
532 
533   PetscFunctionBegin;
534   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
535   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
536   t  = a->solve_work;
537 
538   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
539   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
540 
541   /* copy the b into temp work space according to permutation */
542   for (i=0; i<n; i++) {
543     t[i] = b[c[i]];
544   }
545 
546   /* forward solve the U^T */
547   for (i=0; i<n; i++) {
548 
549     v     = aa + diag[i];
550     /* multiply by the inverse of the block diagonal */
551     s1    = (*v++)*t[i];
552     vi    = aj + diag[i] + 1;
553     nz    = ai[i+1] - diag[i] - 1;
554     while (nz--) {
555       t[*vi++]  -= (*v++)*s1;
556     }
557     t[i]   = s1;
558   }
559   /* backward solve the L^T */
560   for (i=n-1; i>=0; i--){
561     v    = aa + diag[i] - 1;
562     vi   = aj + diag[i] - 1;
563     nz   = diag[i] - ai[i];
564     s1   = t[i];
565     while (nz--) {
566       t[*vi--]   -=  (*v--)*s1;
567     }
568   }
569 
570   /* copy t into x according to permutation */
571   for (i=0; i<n; i++) {
572     x[r[i]]   = t[i];
573   }
574 
575   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
576   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
577   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
578   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
579   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
580   PetscFunctionReturn(0);
581 }
582 
583 #undef __FUNCT__
584 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
585 PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
586 {
587   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
588   IS             iscol=a->col,isrow=a->row;
589   PetscErrorCode ierr;
590   const PetscInt *r,*c,*rout,*cout;
591   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
592   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
593   MatScalar      *aa=a->a,*v;
594   PetscScalar    s1,s2,x1,x2;
595   PetscScalar    *x,*b,*t;
596 
597   PetscFunctionBegin;
598   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
599   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600   t  = a->solve_work;
601 
602   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
603   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
604 
605   /* copy the b into temp work space according to permutation */
606   ii = 0;
607   for (i=0; i<n; i++) {
608     ic      = 2*c[i];
609     t[ii]   = b[ic];
610     t[ii+1] = b[ic+1];
611     ii += 2;
612   }
613 
614   /* forward solve the U^T */
615   idx = 0;
616   for (i=0; i<n; i++) {
617 
618     v     = aa + 4*diag[i];
619     /* multiply by the inverse of the block diagonal */
620     x1    = t[idx];   x2 = t[1+idx];
621     s1 = v[0]*x1  +  v[1]*x2;
622     s2 = v[2]*x1  +  v[3]*x2;
623     v += 4;
624 
625     vi    = aj + diag[i] + 1;
626     nz    = ai[i+1] - diag[i] - 1;
627     while (nz--) {
628       oidx = 2*(*vi++);
629       t[oidx]   -= v[0]*s1  +  v[1]*s2;
630       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
631       v  += 4;
632     }
633     t[idx]   = s1;t[1+idx] = s2;
634     idx += 2;
635   }
636   /* backward solve the L^T */
637   for (i=n-1; i>=0; i--){
638     v    = aa + 4*diag[i] - 4;
639     vi   = aj + diag[i] - 1;
640     nz   = diag[i] - ai[i];
641     idt  = 2*i;
642     s1 = t[idt];  s2 = t[1+idt];
643     while (nz--) {
644       idx   = 2*(*vi--);
645       t[idx]   -=  v[0]*s1 +  v[1]*s2;
646       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
647       v -= 4;
648     }
649   }
650 
651   /* copy t into x according to permutation */
652   ii = 0;
653   for (i=0; i<n; i++) {
654     ir      = 2*r[i];
655     x[ir]   = t[ii];
656     x[ir+1] = t[ii+1];
657     ii += 2;
658   }
659 
660   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
661   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
662   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
663   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
664   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
665   PetscFunctionReturn(0);
666 }
667 
668 #undef __FUNCT__
669 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
670 PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
671 {
672   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
673   IS             iscol=a->col,isrow=a->row;
674   PetscErrorCode ierr;
675   const PetscInt *r,*c,*rout,*cout;
676   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
677   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
678   MatScalar      *aa=a->a,*v;
679   PetscScalar    s1,s2,s3,x1,x2,x3;
680   PetscScalar    *x,*b,*t;
681 
682   PetscFunctionBegin;
683   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
684   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
685   t  = a->solve_work;
686 
687   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
688   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
689 
690   /* copy the b into temp work space according to permutation */
691   ii = 0;
692   for (i=0; i<n; i++) {
693     ic      = 3*c[i];
694     t[ii]   = b[ic];
695     t[ii+1] = b[ic+1];
696     t[ii+2] = b[ic+2];
697     ii += 3;
698   }
699 
700   /* forward solve the U^T */
701   idx = 0;
702   for (i=0; i<n; i++) {
703 
704     v     = aa + 9*diag[i];
705     /* multiply by the inverse of the block diagonal */
706     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
707     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
708     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
709     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
710     v += 9;
711 
712     vi    = aj + diag[i] + 1;
713     nz    = ai[i+1] - diag[i] - 1;
714     while (nz--) {
715       oidx = 3*(*vi++);
716       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
717       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
718       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
719       v  += 9;
720     }
721     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
722     idx += 3;
723   }
724   /* backward solve the L^T */
725   for (i=n-1; i>=0; i--){
726     v    = aa + 9*diag[i] - 9;
727     vi   = aj + diag[i] - 1;
728     nz   = diag[i] - ai[i];
729     idt  = 3*i;
730     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
731     while (nz--) {
732       idx   = 3*(*vi--);
733       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
734       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
735       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
736       v -= 9;
737     }
738   }
739 
740   /* copy t into x according to permutation */
741   ii = 0;
742   for (i=0; i<n; i++) {
743     ir      = 3*r[i];
744     x[ir]   = t[ii];
745     x[ir+1] = t[ii+1];
746     x[ir+2] = t[ii+2];
747     ii += 3;
748   }
749 
750   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
751   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
752   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
753   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
754   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
755   PetscFunctionReturn(0);
756 }
757 
758 #undef __FUNCT__
759 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
760 PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
761 {
762   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
763   IS             iscol=a->col,isrow=a->row;
764   PetscErrorCode ierr;
765   const PetscInt *r,*c,*rout,*cout;
766   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
767   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
768   MatScalar      *aa=a->a,*v;
769   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
770   PetscScalar    *x,*b,*t;
771 
772   PetscFunctionBegin;
773   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
774   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
775   t  = a->solve_work;
776 
777   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
778   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
779 
780   /* copy the b into temp work space according to permutation */
781   ii = 0;
782   for (i=0; i<n; i++) {
783     ic      = 4*c[i];
784     t[ii]   = b[ic];
785     t[ii+1] = b[ic+1];
786     t[ii+2] = b[ic+2];
787     t[ii+3] = b[ic+3];
788     ii += 4;
789   }
790 
791   /* forward solve the U^T */
792   idx = 0;
793   for (i=0; i<n; i++) {
794 
795     v     = aa + 16*diag[i];
796     /* multiply by the inverse of the block diagonal */
797     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
798     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
799     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
800     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
801     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
802     v += 16;
803 
804     vi    = aj + diag[i] + 1;
805     nz    = ai[i+1] - diag[i] - 1;
806     while (nz--) {
807       oidx = 4*(*vi++);
808       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
809       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
810       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
811       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
812       v  += 16;
813     }
814     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
815     idx += 4;
816   }
817   /* backward solve the L^T */
818   for (i=n-1; i>=0; i--){
819     v    = aa + 16*diag[i] - 16;
820     vi   = aj + diag[i] - 1;
821     nz   = diag[i] - ai[i];
822     idt  = 4*i;
823     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
824     while (nz--) {
825       idx   = 4*(*vi--);
826       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
827       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
828       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
829       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
830       v -= 16;
831     }
832   }
833 
834   /* copy t into x according to permutation */
835   ii = 0;
836   for (i=0; i<n; i++) {
837     ir      = 4*r[i];
838     x[ir]   = t[ii];
839     x[ir+1] = t[ii+1];
840     x[ir+2] = t[ii+2];
841     x[ir+3] = t[ii+3];
842     ii += 4;
843   }
844 
845   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
846   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
847   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
848   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
849   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
850   PetscFunctionReturn(0);
851 }
852 
853 #undef __FUNCT__
854 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
855 PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
856 {
857   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
858   IS             iscol=a->col,isrow=a->row;
859   PetscErrorCode ierr;
860   const PetscInt *r,*c,*rout,*cout;
861   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
862   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
863   MatScalar      *aa=a->a,*v;
864   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
865   PetscScalar    *x,*b,*t;
866 
867   PetscFunctionBegin;
868   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
869   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
870   t  = a->solve_work;
871 
872   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
873   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
874 
875   /* copy the b into temp work space according to permutation */
876   ii = 0;
877   for (i=0; i<n; i++) {
878     ic      = 5*c[i];
879     t[ii]   = b[ic];
880     t[ii+1] = b[ic+1];
881     t[ii+2] = b[ic+2];
882     t[ii+3] = b[ic+3];
883     t[ii+4] = b[ic+4];
884     ii += 5;
885   }
886 
887   /* forward solve the U^T */
888   idx = 0;
889   for (i=0; i<n; i++) {
890 
891     v     = aa + 25*diag[i];
892     /* multiply by the inverse of the block diagonal */
893     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
894     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
895     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
896     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
897     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
898     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
899     v += 25;
900 
901     vi    = aj + diag[i] + 1;
902     nz    = ai[i+1] - diag[i] - 1;
903     while (nz--) {
904       oidx = 5*(*vi++);
905       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
906       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
907       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
908       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
909       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
910       v  += 25;
911     }
912     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
913     idx += 5;
914   }
915   /* backward solve the L^T */
916   for (i=n-1; i>=0; i--){
917     v    = aa + 25*diag[i] - 25;
918     vi   = aj + diag[i] - 1;
919     nz   = diag[i] - ai[i];
920     idt  = 5*i;
921     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
922     while (nz--) {
923       idx   = 5*(*vi--);
924       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
925       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
926       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
927       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
928       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
929       v -= 25;
930     }
931   }
932 
933   /* copy t into x according to permutation */
934   ii = 0;
935   for (i=0; i<n; i++) {
936     ir      = 5*r[i];
937     x[ir]   = t[ii];
938     x[ir+1] = t[ii+1];
939     x[ir+2] = t[ii+2];
940     x[ir+3] = t[ii+3];
941     x[ir+4] = t[ii+4];
942     ii += 5;
943   }
944 
945   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
946   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
947   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
948   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
949   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
950   PetscFunctionReturn(0);
951 }
952 
953 #undef __FUNCT__
954 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
955 PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
956 {
957   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
958   IS             iscol=a->col,isrow=a->row;
959   PetscErrorCode ierr;
960   const PetscInt *r,*c,*rout,*cout;
961   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
962   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
963   MatScalar      *aa=a->a,*v;
964   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
965   PetscScalar    *x,*b,*t;
966 
967   PetscFunctionBegin;
968   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
969   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
970   t  = a->solve_work;
971 
972   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
973   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
974 
975   /* copy the b into temp work space according to permutation */
976   ii = 0;
977   for (i=0; i<n; i++) {
978     ic      = 6*c[i];
979     t[ii]   = b[ic];
980     t[ii+1] = b[ic+1];
981     t[ii+2] = b[ic+2];
982     t[ii+3] = b[ic+3];
983     t[ii+4] = b[ic+4];
984     t[ii+5] = b[ic+5];
985     ii += 6;
986   }
987 
988   /* forward solve the U^T */
989   idx = 0;
990   for (i=0; i<n; i++) {
991 
992     v     = aa + 36*diag[i];
993     /* multiply by the inverse of the block diagonal */
994     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
995     x6    = t[5+idx];
996     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
997     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
998     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
999     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1000     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1001     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1002     v += 36;
1003 
1004     vi    = aj + diag[i] + 1;
1005     nz    = ai[i+1] - diag[i] - 1;
1006     while (nz--) {
1007       oidx = 6*(*vi++);
1008       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1009       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1010       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1011       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1012       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1013       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1014       v  += 36;
1015     }
1016     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1017     t[5+idx] = s6;
1018     idx += 6;
1019   }
1020   /* backward solve the L^T */
1021   for (i=n-1; i>=0; i--){
1022     v    = aa + 36*diag[i] - 36;
1023     vi   = aj + diag[i] - 1;
1024     nz   = diag[i] - ai[i];
1025     idt  = 6*i;
1026     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1027     s6 = t[5+idt];
1028     while (nz--) {
1029       idx   = 6*(*vi--);
1030       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1031       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1032       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1033       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1034       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1035       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1036       v -= 36;
1037     }
1038   }
1039 
1040   /* copy t into x according to permutation */
1041   ii = 0;
1042   for (i=0; i<n; i++) {
1043     ir      = 6*r[i];
1044     x[ir]   = t[ii];
1045     x[ir+1] = t[ii+1];
1046     x[ir+2] = t[ii+2];
1047     x[ir+3] = t[ii+3];
1048     x[ir+4] = t[ii+4];
1049     x[ir+5] = t[ii+5];
1050     ii += 6;
1051   }
1052 
1053   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1054   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1055   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1056   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1057   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1058   PetscFunctionReturn(0);
1059 }
1060 
1061 #undef __FUNCT__
1062 #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1063 PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1064 {
1065   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1066   IS             iscol=a->col,isrow=a->row;
1067   PetscErrorCode ierr;
1068   const PetscInt *r,*c,*rout,*cout;
1069   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1070   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1071   MatScalar      *aa=a->a,*v;
1072   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1073   PetscScalar    *x,*b,*t;
1074 
1075   PetscFunctionBegin;
1076   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1077   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1078   t  = a->solve_work;
1079 
1080   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1081   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1082 
1083   /* copy the b into temp work space according to permutation */
1084   ii = 0;
1085   for (i=0; i<n; i++) {
1086     ic      = 7*c[i];
1087     t[ii]   = b[ic];
1088     t[ii+1] = b[ic+1];
1089     t[ii+2] = b[ic+2];
1090     t[ii+3] = b[ic+3];
1091     t[ii+4] = b[ic+4];
1092     t[ii+5] = b[ic+5];
1093     t[ii+6] = b[ic+6];
1094     ii += 7;
1095   }
1096 
1097   /* forward solve the U^T */
1098   idx = 0;
1099   for (i=0; i<n; i++) {
1100 
1101     v     = aa + 49*diag[i];
1102     /* multiply by the inverse of the block diagonal */
1103     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1104     x6    = t[5+idx]; x7 = t[6+idx];
1105     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1106     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1107     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1108     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1109     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1110     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1111     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1112     v += 49;
1113 
1114     vi    = aj + diag[i] + 1;
1115     nz    = ai[i+1] - diag[i] - 1;
1116     while (nz--) {
1117       oidx = 7*(*vi++);
1118       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1119       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1120       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1121       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1122       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1123       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1124       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1125       v  += 49;
1126     }
1127     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1128     t[5+idx] = s6;t[6+idx] = s7;
1129     idx += 7;
1130   }
1131   /* backward solve the L^T */
1132   for (i=n-1; i>=0; i--){
1133     v    = aa + 49*diag[i] - 49;
1134     vi   = aj + diag[i] - 1;
1135     nz   = diag[i] - ai[i];
1136     idt  = 7*i;
1137     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1138     s6 = t[5+idt];s7 = t[6+idt];
1139     while (nz--) {
1140       idx   = 7*(*vi--);
1141       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1142       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1143       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1144       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1145       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1146       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1147       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1148       v -= 49;
1149     }
1150   }
1151 
1152   /* copy t into x according to permutation */
1153   ii = 0;
1154   for (i=0; i<n; i++) {
1155     ir      = 7*r[i];
1156     x[ir]   = t[ii];
1157     x[ir+1] = t[ii+1];
1158     x[ir+2] = t[ii+2];
1159     x[ir+3] = t[ii+3];
1160     x[ir+4] = t[ii+4];
1161     x[ir+5] = t[ii+5];
1162     x[ir+6] = t[ii+6];
1163     ii += 7;
1164   }
1165 
1166   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1168   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1169   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1171   PetscFunctionReturn(0);
1172 }
1173 
1174 /* ----------------------------------------------------------- */
1175 #undef __FUNCT__
1176 #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1177 PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
1178 {
1179   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1180   IS             iscol=a->col,isrow=a->row;
1181   PetscErrorCode ierr;
1182   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1183   PetscInt       i,n=a->mbs;
1184   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
1185   MatScalar      *aa=a->a,*v;
1186   PetscScalar    *x,*b,*s,*t,*ls;
1187 
1188   PetscFunctionBegin;
1189   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1190   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1191   t  = a->solve_work;
1192 
1193   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1194   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1195 
1196   /* forward solve the lower triangular */
1197   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1198   for (i=1; i<n; i++) {
1199     v   = aa + bs2*ai[i];
1200     vi  = aj + ai[i];
1201     nz  = a->diag[i] - ai[i];
1202     s = t + bs*i;
1203     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
1204     while (nz--) {
1205       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
1206       v += bs2;
1207     }
1208   }
1209   /* backward solve the upper triangular */
1210   ls = a->solve_work + A->cmap->n;
1211   for (i=n-1; i>=0; i--){
1212     v   = aa + bs2*(a->diag[i] + 1);
1213     vi  = aj + a->diag[i] + 1;
1214     nz  = ai[i+1] - a->diag[i] - 1;
1215     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1216     while (nz--) {
1217       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
1218       v += bs2;
1219     }
1220     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1221     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1222   }
1223 
1224   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1225   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1226   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1227   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1228   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1229   PetscFunctionReturn(0);
1230 }
1231 
1232 #undef __FUNCT__
1233 #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1234 PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1235 {
1236   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1237   IS             iscol=a->col,isrow=a->row;
1238   PetscErrorCode ierr;
1239   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1240   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
1241   MatScalar      *aa=a->a,*v;
1242   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1243   PetscScalar    *x,*b,*t;
1244 
1245   PetscFunctionBegin;
1246   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1247   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1248   t  = a->solve_work;
1249 
1250   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1251   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1252 
1253   /* forward solve the lower triangular */
1254   idx    = 7*(*r++);
1255   t[0] = b[idx];   t[1] = b[1+idx];
1256   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1257   t[5] = b[5+idx]; t[6] = b[6+idx];
1258 
1259   for (i=1; i<n; i++) {
1260     v     = aa + 49*ai[i];
1261     vi    = aj + ai[i];
1262     nz    = diag[i] - ai[i];
1263     idx   = 7*(*r++);
1264     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1265     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1266     while (nz--) {
1267       idx   = 7*(*vi++);
1268       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1269       x4    = t[3+idx];x5 = t[4+idx];
1270       x6    = t[5+idx];x7 = t[6+idx];
1271       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1272       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1273       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1274       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1275       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1276       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1277       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1278       v += 49;
1279     }
1280     idx = 7*i;
1281     t[idx]   = s1;t[1+idx] = s2;
1282     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1283     t[5+idx] = s6;t[6+idx] = s7;
1284   }
1285   /* backward solve the upper triangular */
1286   for (i=n-1; i>=0; i--){
1287     v    = aa + 49*diag[i] + 49;
1288     vi   = aj + diag[i] + 1;
1289     nz   = ai[i+1] - diag[i] - 1;
1290     idt  = 7*i;
1291     s1 = t[idt];  s2 = t[1+idt];
1292     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1293     s6 = t[5+idt];s7 = t[6+idt];
1294     while (nz--) {
1295       idx   = 7*(*vi++);
1296       x1    = t[idx];   x2 = t[1+idx];
1297       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1298       x6    = t[5+idx]; x7 = t[6+idx];
1299       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1300       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1301       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1302       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1303       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1304       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1305       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1306       v += 49;
1307     }
1308     idc = 7*(*c--);
1309     v   = aa + 49*diag[i];
1310     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1311                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1312     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1313                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1314     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1315                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1316     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1317                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1318     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1319                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1320     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1321                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1322     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1323                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1324   }
1325 
1326   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1327   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1328   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1329   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1330   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1331   PetscFunctionReturn(0);
1332 }
1333 
1334 #undef __FUNCT__
1335 #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1336 PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
1337 {
1338   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1339   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1340   PetscErrorCode    ierr;
1341   PetscInt          *diag = a->diag,jdx;
1342   const MatScalar   *aa=a->a,*v;
1343   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1344   const PetscScalar *b;
1345 
1346   PetscFunctionBegin;
1347   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1348   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1349   /* forward solve the lower triangular */
1350   idx    = 0;
1351   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1352   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1353   x[6] = b[6+idx];
1354   for (i=1; i<n; i++) {
1355     v     =  aa + 49*ai[i];
1356     vi    =  aj + ai[i];
1357     nz    =  diag[i] - ai[i];
1358     idx   =  7*i;
1359     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1360     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1361     s7  =  b[6+idx];
1362     while (nz--) {
1363       jdx   = 7*(*vi++);
1364       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1365       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1366       x7    = x[6+jdx];
1367       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1368       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1369       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1370       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1371       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1372       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1373       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1374       v += 49;
1375      }
1376     x[idx]   = s1;
1377     x[1+idx] = s2;
1378     x[2+idx] = s3;
1379     x[3+idx] = s4;
1380     x[4+idx] = s5;
1381     x[5+idx] = s6;
1382     x[6+idx] = s7;
1383   }
1384   /* backward solve the upper triangular */
1385   for (i=n-1; i>=0; i--){
1386     v    = aa + 49*diag[i] + 49;
1387     vi   = aj + diag[i] + 1;
1388     nz   = ai[i+1] - diag[i] - 1;
1389     idt  = 7*i;
1390     s1 = x[idt];   s2 = x[1+idt];
1391     s3 = x[2+idt]; s4 = x[3+idt];
1392     s5 = x[4+idt]; s6 = x[5+idt];
1393     s7 = x[6+idt];
1394     while (nz--) {
1395       idx   = 7*(*vi++);
1396       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1397       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1398       x7    = x[6+idx];
1399       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1400       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1401       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1402       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1403       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1404       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1405       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1406       v += 49;
1407     }
1408     v        = aa + 49*diag[i];
1409     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1410                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1411     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1412                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1413     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1414                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1415     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1416                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1417     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1418                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1419     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1420                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1421     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1422                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
1423   }
1424 
1425   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1426   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1427   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1428   PetscFunctionReturn(0);
1429 }
1430 
1431 #undef __FUNCT__
1432 #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1433 PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1434 {
1435   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1436   IS                iscol=a->col,isrow=a->row;
1437   PetscErrorCode    ierr;
1438   const PetscInt    *r,*c,*rout,*cout;
1439   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1440   const MatScalar   *aa=a->a,*v;
1441   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1442   const PetscScalar *b;
1443   PetscFunctionBegin;
1444   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1445   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1446   t  = a->solve_work;
1447 
1448   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1449   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1450 
1451   /* forward solve the lower triangular */
1452   idx    = 6*(*r++);
1453   t[0] = b[idx];   t[1] = b[1+idx];
1454   t[2] = b[2+idx]; t[3] = b[3+idx];
1455   t[4] = b[4+idx]; t[5] = b[5+idx];
1456   for (i=1; i<n; i++) {
1457     v     = aa + 36*ai[i];
1458     vi    = aj + ai[i];
1459     nz    = diag[i] - ai[i];
1460     idx   = 6*(*r++);
1461     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1462     s5  = b[4+idx]; s6 = b[5+idx];
1463     while (nz--) {
1464       idx   = 6*(*vi++);
1465       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1466       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1467       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1468       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1469       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1470       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1471       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1472       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1473       v += 36;
1474     }
1475     idx = 6*i;
1476     t[idx]   = s1;t[1+idx] = s2;
1477     t[2+idx] = s3;t[3+idx] = s4;
1478     t[4+idx] = s5;t[5+idx] = s6;
1479   }
1480   /* backward solve the upper triangular */
1481   for (i=n-1; i>=0; i--){
1482     v    = aa + 36*diag[i] + 36;
1483     vi   = aj + diag[i] + 1;
1484     nz   = ai[i+1] - diag[i] - 1;
1485     idt  = 6*i;
1486     s1 = t[idt];  s2 = t[1+idt];
1487     s3 = t[2+idt];s4 = t[3+idt];
1488     s5 = t[4+idt];s6 = t[5+idt];
1489     while (nz--) {
1490       idx   = 6*(*vi++);
1491       x1    = t[idx];   x2 = t[1+idx];
1492       x3    = t[2+idx]; x4 = t[3+idx];
1493       x5    = t[4+idx]; x6 = t[5+idx];
1494       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1495       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1496       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1497       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1498       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1499       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1500       v += 36;
1501     }
1502     idc = 6*(*c--);
1503     v   = aa + 36*diag[i];
1504     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1505                                  v[18]*s4+v[24]*s5+v[30]*s6;
1506     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1507                                  v[19]*s4+v[25]*s5+v[31]*s6;
1508     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1509                                  v[20]*s4+v[26]*s5+v[32]*s6;
1510     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1511                                  v[21]*s4+v[27]*s5+v[33]*s6;
1512     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1513                                  v[22]*s4+v[28]*s5+v[34]*s6;
1514     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1515                                  v[23]*s4+v[29]*s5+v[35]*s6;
1516   }
1517 
1518   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1519   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1520   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1521   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1522   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1523   PetscFunctionReturn(0);
1524 }
1525 
1526 #undef __FUNCT__
1527 #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1528 PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
1529 {
1530   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1531   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1532   PetscErrorCode    ierr;
1533   PetscInt          *diag = a->diag,jdx;
1534   const MatScalar   *aa=a->a,*v;
1535   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1536   const PetscScalar *b;
1537 
1538   PetscFunctionBegin;
1539   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1540   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1541   /* forward solve the lower triangular */
1542   idx    = 0;
1543   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
1544   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
1545   for (i=1; i<n; i++) {
1546     v     =  aa + 36*ai[i];
1547     vi    =  aj + ai[i];
1548     nz    =  diag[i] - ai[i];
1549     idx   =  6*i;
1550     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1551     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1552     while (nz--) {
1553       jdx   = 6*(*vi++);
1554       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
1555       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1556       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1557       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1558       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1559       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1560       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1561       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1562       v += 36;
1563      }
1564     x[idx]   = s1;
1565     x[1+idx] = s2;
1566     x[2+idx] = s3;
1567     x[3+idx] = s4;
1568     x[4+idx] = s5;
1569     x[5+idx] = s6;
1570   }
1571   /* backward solve the upper triangular */
1572   for (i=n-1; i>=0; i--){
1573     v    = aa + 36*diag[i] + 36;
1574     vi   = aj + diag[i] + 1;
1575     nz   = ai[i+1] - diag[i] - 1;
1576     idt  = 6*i;
1577     s1 = x[idt];   s2 = x[1+idt];
1578     s3 = x[2+idt]; s4 = x[3+idt];
1579     s5 = x[4+idt]; s6 = x[5+idt];
1580     while (nz--) {
1581       idx   = 6*(*vi++);
1582       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
1583       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1584       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1585       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1586       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1587       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1588       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1589       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1590       v += 36;
1591     }
1592     v        = aa + 36*diag[i];
1593     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1594     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1595     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1596     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1597     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1598     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1599   }
1600 
1601   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1602   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1603   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1604   PetscFunctionReturn(0);
1605 }
1606 
1607 #undef __FUNCT__
1608 #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1609 PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1610 {
1611   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1612   IS                iscol=a->col,isrow=a->row;
1613   PetscErrorCode    ierr;
1614   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1615   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1616   const MatScalar   *aa=a->a,*v;
1617   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1618   const PetscScalar *b;
1619 
1620   PetscFunctionBegin;
1621   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1622   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1623   t  = a->solve_work;
1624 
1625   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1626   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1627 
1628   /* forward solve the lower triangular */
1629   idx    = 5*(*r++);
1630   t[0] = b[idx];   t[1] = b[1+idx];
1631   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1632   for (i=1; i<n; i++) {
1633     v     = aa + 25*ai[i];
1634     vi    = aj + ai[i];
1635     nz    = diag[i] - ai[i];
1636     idx   = 5*(*r++);
1637     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1638     s5  = b[4+idx];
1639     while (nz--) {
1640       idx   = 5*(*vi++);
1641       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1642       x4    = t[3+idx];x5 = t[4+idx];
1643       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1644       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1645       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1646       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1647       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1648       v += 25;
1649     }
1650     idx = 5*i;
1651     t[idx]   = s1;t[1+idx] = s2;
1652     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1653   }
1654   /* backward solve the upper triangular */
1655   for (i=n-1; i>=0; i--){
1656     v    = aa + 25*diag[i] + 25;
1657     vi   = aj + diag[i] + 1;
1658     nz   = ai[i+1] - diag[i] - 1;
1659     idt  = 5*i;
1660     s1 = t[idt];  s2 = t[1+idt];
1661     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1662     while (nz--) {
1663       idx   = 5*(*vi++);
1664       x1    = t[idx];   x2 = t[1+idx];
1665       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1666       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1667       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1668       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1669       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1670       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1671       v += 25;
1672     }
1673     idc = 5*(*c--);
1674     v   = aa + 25*diag[i];
1675     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1676                                  v[15]*s4+v[20]*s5;
1677     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1678                                  v[16]*s4+v[21]*s5;
1679     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1680                                  v[17]*s4+v[22]*s5;
1681     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1682                                  v[18]*s4+v[23]*s5;
1683     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1684                                  v[19]*s4+v[24]*s5;
1685   }
1686 
1687   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1688   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1689   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1690   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1691   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1692   PetscFunctionReturn(0);
1693 }
1694 
1695 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1696 {
1697   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1698   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1699   PetscErrorCode    ierr;
1700   PetscInt          jdx;
1701   const MatScalar   *aa=a->a,*v;
1702   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1703   const PetscScalar *b;
1704 
1705   PetscFunctionBegin;
1706   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1707   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708   /* forward solve the lower triangular */
1709   idx    = 0;
1710   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1711   for (i=1; i<n; i++) {
1712     v   = aa + 25*ai[i];
1713     vi  = aj + ai[i];
1714     nz  = ai[i+1] - ai[i];
1715     idx = 5*i;
1716     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1717     while (nz--) {
1718       jdx   = 5*(*vi++);
1719       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1720       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1721       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1722       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1723       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1724       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1725       v    += 25;
1726     }
1727     x[idx]   = s1;
1728     x[1+idx] = s2;
1729     x[2+idx] = s3;
1730     x[3+idx] = s4;
1731     x[4+idx] = s5;
1732   }
1733 
1734   /* backward solve the upper triangular */
1735   for (i=n-1; i>=0; i--){
1736     v   = aa + 25*ai[2*n-i];
1737     vi  = aj + ai[2*n-i];
1738     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1739     idt = 5*i;
1740     s1 = x[idt];  s2 = x[1+idt];
1741     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1742     while (nz--) {
1743       idx   = 5*(*vi++);
1744       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1745       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1746       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1747       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1748       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1749       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1750       v    += 25;
1751     }
1752     /* x = inv_diagonal*x */
1753     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1754     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1755     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1756     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1757     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1758   }
1759 
1760   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1761   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1762   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1763   PetscFunctionReturn(0);
1764 }
1765 
1766 #undef __FUNCT__
1767 #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1768 PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
1769 {
1770   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1771   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1772   PetscErrorCode    ierr;
1773   PetscInt          *diag = a->diag,jdx;
1774   const MatScalar   *aa=a->a,*v;
1775   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1776   const PetscScalar *b;
1777 
1778   PetscFunctionBegin;
1779   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1780   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1781   /* forward solve the lower triangular */
1782   idx    = 0;
1783   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1784   for (i=1; i<n; i++) {
1785     v     =  aa + 25*ai[i];
1786     vi    =  aj + ai[i];
1787     nz    =  diag[i] - ai[i];
1788     idx   =  5*i;
1789     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1790     while (nz--) {
1791       jdx   = 5*(*vi++);
1792       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1793       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1794       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1795       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1796       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1797       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1798       v    += 25;
1799     }
1800     x[idx]   = s1;
1801     x[1+idx] = s2;
1802     x[2+idx] = s3;
1803     x[3+idx] = s4;
1804     x[4+idx] = s5;
1805   }
1806   /* backward solve the upper triangular */
1807   for (i=n-1; i>=0; i--){
1808     v    = aa + 25*diag[i] + 25;
1809     vi   = aj + diag[i] + 1;
1810     nz   = ai[i+1] - diag[i] - 1;
1811     idt  = 5*i;
1812     s1 = x[idt];  s2 = x[1+idt];
1813     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1814     while (nz--) {
1815       idx   = 5*(*vi++);
1816       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1817       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1818       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1819       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1820       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1821       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1822       v    += 25;
1823     }
1824     v        = aa + 25*diag[i];
1825     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1826     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1827     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1828     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1829     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1830   }
1831 
1832   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1833   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1834   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1835   PetscFunctionReturn(0);
1836 }
1837 
1838 #undef __FUNCT__
1839 #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1840 PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1841 {
1842   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1843   IS                iscol=a->col,isrow=a->row;
1844   PetscErrorCode    ierr;
1845   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1846   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1847   const MatScalar   *aa=a->a,*v;
1848   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1849   const PetscScalar *b;
1850 
1851   PetscFunctionBegin;
1852   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1853   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1854   t  = a->solve_work;
1855 
1856   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1857   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1858 
1859   /* forward solve the lower triangular */
1860   idx    = 4*(*r++);
1861   t[0] = b[idx];   t[1] = b[1+idx];
1862   t[2] = b[2+idx]; t[3] = b[3+idx];
1863   for (i=1; i<n; i++) {
1864     v     = aa + 16*ai[i];
1865     vi    = aj + ai[i];
1866     nz    = diag[i] - ai[i];
1867     idx   = 4*(*r++);
1868     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1869     while (nz--) {
1870       idx   = 4*(*vi++);
1871       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1872       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1873       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1874       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1875       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1876       v    += 16;
1877     }
1878     idx        = 4*i;
1879     t[idx]   = s1;t[1+idx] = s2;
1880     t[2+idx] = s3;t[3+idx] = s4;
1881   }
1882   /* backward solve the upper triangular */
1883   for (i=n-1; i>=0; i--){
1884     v    = aa + 16*diag[i] + 16;
1885     vi   = aj + diag[i] + 1;
1886     nz   = ai[i+1] - diag[i] - 1;
1887     idt  = 4*i;
1888     s1 = t[idt];  s2 = t[1+idt];
1889     s3 = t[2+idt];s4 = t[3+idt];
1890     while (nz--) {
1891       idx   = 4*(*vi++);
1892       x1    = t[idx];   x2 = t[1+idx];
1893       x3    = t[2+idx]; x4 = t[3+idx];
1894       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1895       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1896       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1897       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1898       v += 16;
1899     }
1900     idc      = 4*(*c--);
1901     v        = aa + 16*diag[i];
1902     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1903     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1904     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1905     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1906   }
1907 
1908   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1909   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1910   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1911   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1912   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1913   PetscFunctionReturn(0);
1914 }
1915 
1916 #undef __FUNCT__
1917 #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1918 PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1919 {
1920   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1921   IS                iscol=a->col,isrow=a->row;
1922   PetscErrorCode    ierr;
1923   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1924   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1925   const MatScalar   *aa=a->a,*v;
1926   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1927   PetscScalar       *x;
1928   const PetscScalar *b;
1929 
1930   PetscFunctionBegin;
1931   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1932   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1933   t  = (MatScalar *)a->solve_work;
1934 
1935   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1936   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1937 
1938   /* forward solve the lower triangular */
1939   idx    = 4*(*r++);
1940   t[0] = (MatScalar)b[idx];
1941   t[1] = (MatScalar)b[1+idx];
1942   t[2] = (MatScalar)b[2+idx];
1943   t[3] = (MatScalar)b[3+idx];
1944   for (i=1; i<n; i++) {
1945     v     = aa + 16*ai[i];
1946     vi    = aj + ai[i];
1947     nz    = diag[i] - ai[i];
1948     idx   = 4*(*r++);
1949     s1 = (MatScalar)b[idx];
1950     s2 = (MatScalar)b[1+idx];
1951     s3 = (MatScalar)b[2+idx];
1952     s4 = (MatScalar)b[3+idx];
1953     while (nz--) {
1954       idx   = 4*(*vi++);
1955       x1  = t[idx];
1956       x2  = t[1+idx];
1957       x3  = t[2+idx];
1958       x4  = t[3+idx];
1959       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1960       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1961       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1962       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1963       v    += 16;
1964     }
1965     idx        = 4*i;
1966     t[idx]   = s1;
1967     t[1+idx] = s2;
1968     t[2+idx] = s3;
1969     t[3+idx] = s4;
1970   }
1971   /* backward solve the upper triangular */
1972   for (i=n-1; i>=0; i--){
1973     v    = aa + 16*diag[i] + 16;
1974     vi   = aj + diag[i] + 1;
1975     nz   = ai[i+1] - diag[i] - 1;
1976     idt  = 4*i;
1977     s1 = t[idt];
1978     s2 = t[1+idt];
1979     s3 = t[2+idt];
1980     s4 = t[3+idt];
1981     while (nz--) {
1982       idx   = 4*(*vi++);
1983       x1  = t[idx];
1984       x2  = t[1+idx];
1985       x3  = t[2+idx];
1986       x4  = t[3+idx];
1987       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1988       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1989       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1990       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1991       v += 16;
1992     }
1993     idc      = 4*(*c--);
1994     v        = aa + 16*diag[i];
1995     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1996     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1997     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1998     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1999     x[idc]   = (PetscScalar)t[idt];
2000     x[1+idc] = (PetscScalar)t[1+idt];
2001     x[2+idc] = (PetscScalar)t[2+idt];
2002     x[3+idc] = (PetscScalar)t[3+idt];
2003  }
2004 
2005   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2006   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2007   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2008   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2009   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2010   PetscFunctionReturn(0);
2011 }
2012 
2013 #if defined (PETSC_HAVE_SSE)
2014 
2015 #include PETSC_HAVE_SSE
2016 
2017 #undef __FUNCT__
2018 #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2019 PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
2020 {
2021   /*
2022      Note: This code uses demotion of double
2023      to float when performing the mixed-mode computation.
2024      This may not be numerically reasonable for all applications.
2025   */
2026   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2027   IS             iscol=a->col,isrow=a->row;
2028   PetscErrorCode ierr;
2029   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
2030   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
2031   MatScalar      *aa=a->a,*v;
2032   PetscScalar    *x,*b,*t;
2033 
2034   /* Make space in temp stack for 16 Byte Aligned arrays */
2035   float           ssealignedspace[11],*tmps,*tmpx;
2036   unsigned long   offset;
2037 
2038   PetscFunctionBegin;
2039   SSE_SCOPE_BEGIN;
2040 
2041     offset = (unsigned long)ssealignedspace % 16;
2042     if (offset) offset = (16 - offset)/4;
2043     tmps = &ssealignedspace[offset];
2044     tmpx = &ssealignedspace[offset+4];
2045     PREFETCH_NTA(aa+16*ai[1]);
2046 
2047     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2048     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2049     t  = a->solve_work;
2050 
2051     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2052     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2053 
2054     /* forward solve the lower triangular */
2055     idx  = 4*(*r++);
2056     t[0] = b[idx];   t[1] = b[1+idx];
2057     t[2] = b[2+idx]; t[3] = b[3+idx];
2058     v    =  aa + 16*ai[1];
2059 
2060     for (i=1; i<n;) {
2061       PREFETCH_NTA(&v[8]);
2062       vi   =  aj      + ai[i];
2063       nz   =  diag[i] - ai[i];
2064       idx  =  4*(*r++);
2065 
2066       /* Demote sum from double to float */
2067       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
2068       LOAD_PS(tmps,XMM7);
2069 
2070       while (nz--) {
2071         PREFETCH_NTA(&v[16]);
2072         idx = 4*(*vi++);
2073 
2074         /* Demote solution (so far) from double to float */
2075         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
2076 
2077         /* 4x4 Matrix-Vector product with negative accumulation: */
2078         SSE_INLINE_BEGIN_2(tmpx,v)
2079           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2080 
2081           /* First Column */
2082           SSE_COPY_PS(XMM0,XMM6)
2083           SSE_SHUFFLE(XMM0,XMM0,0x00)
2084           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2085           SSE_SUB_PS(XMM7,XMM0)
2086 
2087           /* Second Column */
2088           SSE_COPY_PS(XMM1,XMM6)
2089           SSE_SHUFFLE(XMM1,XMM1,0x55)
2090           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2091           SSE_SUB_PS(XMM7,XMM1)
2092 
2093           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2094 
2095           /* Third Column */
2096           SSE_COPY_PS(XMM2,XMM6)
2097           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2098           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2099           SSE_SUB_PS(XMM7,XMM2)
2100 
2101           /* Fourth Column */
2102           SSE_COPY_PS(XMM3,XMM6)
2103           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2104           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2105           SSE_SUB_PS(XMM7,XMM3)
2106         SSE_INLINE_END_2
2107 
2108         v  += 16;
2109       }
2110       idx = 4*i;
2111       v   = aa + 16*ai[++i];
2112       PREFETCH_NTA(v);
2113       STORE_PS(tmps,XMM7);
2114 
2115       /* Promote result from float to double */
2116       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
2117     }
2118     /* backward solve the upper triangular */
2119     idt  = 4*(n-1);
2120     ai16 = 16*diag[n-1];
2121     v    = aa + ai16 + 16;
2122     for (i=n-1; i>=0;){
2123       PREFETCH_NTA(&v[8]);
2124       vi = aj + diag[i] + 1;
2125       nz = ai[i+1] - diag[i] - 1;
2126 
2127       /* Demote accumulator from double to float */
2128       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
2129       LOAD_PS(tmps,XMM7);
2130 
2131       while (nz--) {
2132         PREFETCH_NTA(&v[16]);
2133         idx = 4*(*vi++);
2134 
2135         /* Demote solution (so far) from double to float */
2136         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
2137 
2138         /* 4x4 Matrix-Vector Product with negative accumulation: */
2139         SSE_INLINE_BEGIN_2(tmpx,v)
2140           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2141 
2142           /* First Column */
2143           SSE_COPY_PS(XMM0,XMM6)
2144           SSE_SHUFFLE(XMM0,XMM0,0x00)
2145           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2146           SSE_SUB_PS(XMM7,XMM0)
2147 
2148           /* Second Column */
2149           SSE_COPY_PS(XMM1,XMM6)
2150           SSE_SHUFFLE(XMM1,XMM1,0x55)
2151           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2152           SSE_SUB_PS(XMM7,XMM1)
2153 
2154           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2155 
2156           /* Third Column */
2157           SSE_COPY_PS(XMM2,XMM6)
2158           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2159           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2160           SSE_SUB_PS(XMM7,XMM2)
2161 
2162           /* Fourth Column */
2163           SSE_COPY_PS(XMM3,XMM6)
2164           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2165           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2166           SSE_SUB_PS(XMM7,XMM3)
2167         SSE_INLINE_END_2
2168         v  += 16;
2169       }
2170       v    = aa + ai16;
2171       ai16 = 16*diag[--i];
2172       PREFETCH_NTA(aa+ai16+16);
2173       /*
2174          Scale the result by the diagonal 4x4 block,
2175          which was inverted as part of the factorization
2176       */
2177       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
2178         /* First Column */
2179         SSE_COPY_PS(XMM0,XMM7)
2180         SSE_SHUFFLE(XMM0,XMM0,0x00)
2181         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2182 
2183         /* Second Column */
2184         SSE_COPY_PS(XMM1,XMM7)
2185         SSE_SHUFFLE(XMM1,XMM1,0x55)
2186         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2187         SSE_ADD_PS(XMM0,XMM1)
2188 
2189         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2190 
2191         /* Third Column */
2192         SSE_COPY_PS(XMM2,XMM7)
2193         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2194         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2195         SSE_ADD_PS(XMM0,XMM2)
2196 
2197         /* Fourth Column */
2198         SSE_COPY_PS(XMM3,XMM7)
2199         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2200         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2201         SSE_ADD_PS(XMM0,XMM3)
2202 
2203         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2204       SSE_INLINE_END_3
2205 
2206       /* Promote solution from float to double */
2207       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
2208 
2209       /* Apply reordering to t and stream into x.    */
2210       /* This way, x doesn't pollute the cache.      */
2211       /* Be careful with size: 2 doubles = 4 floats! */
2212       idc  = 4*(*c--);
2213       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
2214         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
2215         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
2216         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
2217         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
2218         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
2219         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
2220       SSE_INLINE_END_2
2221       v    = aa + ai16 + 16;
2222       idt -= 4;
2223     }
2224 
2225     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2226     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2227     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2228     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2229     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2230   SSE_SCOPE_END;
2231   PetscFunctionReturn(0);
2232 }
2233 
2234 #endif
2235 
2236 
2237 /*
2238       Special case where the matrix was ILU(0) factored in the natural
2239    ordering. This eliminates the need for the column and row permutation.
2240 */
2241 #undef __FUNCT__
2242 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2243 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
2244 {
2245   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2246   PetscInt          n=a->mbs;
2247   const PetscInt    *ai=a->i,*aj=a->j;
2248   PetscErrorCode    ierr;
2249   const PetscInt    *diag = a->diag;
2250   const MatScalar   *aa=a->a;
2251   PetscScalar       *x;
2252   const PetscScalar *b;
2253 
2254   PetscFunctionBegin;
2255   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2256   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2257 
2258 #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
2259   {
2260     static PetscScalar w[2000]; /* very BAD need to fix */
2261     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
2262   }
2263 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
2264   {
2265     static PetscScalar w[2000]; /* very BAD need to fix */
2266     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
2267   }
2268 #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
2269   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2270 #else
2271   {
2272     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2273     const MatScalar *v;
2274     PetscInt        jdx,idt,idx,nz,i,ai16;
2275     const PetscInt  *vi;
2276 
2277   /* forward solve the lower triangular */
2278   idx    = 0;
2279   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
2280   for (i=1; i<n; i++) {
2281     v     =  aa      + 16*ai[i];
2282     vi    =  aj      + ai[i];
2283     nz    =  diag[i] - ai[i];
2284     idx   +=  4;
2285     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2286     while (nz--) {
2287       jdx   = 4*(*vi++);
2288       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2289       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2290       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2291       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2292       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2293       v    += 16;
2294     }
2295     x[idx]   = s1;
2296     x[1+idx] = s2;
2297     x[2+idx] = s3;
2298     x[3+idx] = s4;
2299   }
2300   /* backward solve the upper triangular */
2301   idt = 4*(n-1);
2302   for (i=n-1; i>=0; i--){
2303     ai16 = 16*diag[i];
2304     v    = aa + ai16 + 16;
2305     vi   = aj + diag[i] + 1;
2306     nz   = ai[i+1] - diag[i] - 1;
2307     s1 = x[idt];  s2 = x[1+idt];
2308     s3 = x[2+idt];s4 = x[3+idt];
2309     while (nz--) {
2310       idx   = 4*(*vi++);
2311       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2312       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2313       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2314       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2315       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2316       v    += 16;
2317     }
2318     v        = aa + ai16;
2319     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2320     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2321     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2322     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2323     idt -= 4;
2324   }
2325   }
2326 #endif
2327 
2328   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2329   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2330   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2331   PetscFunctionReturn(0);
2332 }
2333 
2334 #undef __FUNCT__
2335 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2336 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2337 {
2338   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2339   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2340   PetscErrorCode ierr;
2341   PetscInt       *diag = a->diag;
2342   MatScalar      *aa=a->a;
2343   PetscScalar    *x,*b;
2344 
2345   PetscFunctionBegin;
2346   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2347   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2348 
2349   {
2350     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2351     MatScalar  *v,*t=(MatScalar *)x;
2352     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2353 
2354     /* forward solve the lower triangular */
2355     idx  = 0;
2356     t[0] = (MatScalar)b[0];
2357     t[1] = (MatScalar)b[1];
2358     t[2] = (MatScalar)b[2];
2359     t[3] = (MatScalar)b[3];
2360     for (i=1; i<n; i++) {
2361       v     =  aa      + 16*ai[i];
2362       vi    =  aj      + ai[i];
2363       nz    =  diag[i] - ai[i];
2364       idx   +=  4;
2365       s1 = (MatScalar)b[idx];
2366       s2 = (MatScalar)b[1+idx];
2367       s3 = (MatScalar)b[2+idx];
2368       s4 = (MatScalar)b[3+idx];
2369       while (nz--) {
2370         jdx = 4*(*vi++);
2371         x1  = t[jdx];
2372         x2  = t[1+jdx];
2373         x3  = t[2+jdx];
2374         x4  = t[3+jdx];
2375         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2376         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2377         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2378         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2379         v    += 16;
2380       }
2381       t[idx]   = s1;
2382       t[1+idx] = s2;
2383       t[2+idx] = s3;
2384       t[3+idx] = s4;
2385     }
2386     /* backward solve the upper triangular */
2387     idt = 4*(n-1);
2388     for (i=n-1; i>=0; i--){
2389       ai16 = 16*diag[i];
2390       v    = aa + ai16 + 16;
2391       vi   = aj + diag[i] + 1;
2392       nz   = ai[i+1] - diag[i] - 1;
2393       s1   = t[idt];
2394       s2   = t[1+idt];
2395       s3   = t[2+idt];
2396       s4   = t[3+idt];
2397       while (nz--) {
2398         idx = 4*(*vi++);
2399         x1  = (MatScalar)x[idx];
2400         x2  = (MatScalar)x[1+idx];
2401         x3  = (MatScalar)x[2+idx];
2402         x4  = (MatScalar)x[3+idx];
2403         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2404         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2405         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2406         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2407         v    += 16;
2408       }
2409       v        = aa + ai16;
2410       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2411       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2412       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2413       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2414       idt -= 4;
2415     }
2416   }
2417 
2418   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2419   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2420   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2421   PetscFunctionReturn(0);
2422 }
2423 
2424 #if defined (PETSC_HAVE_SSE)
2425 
2426 #include PETSC_HAVE_SSE
2427 #undef __FUNCT__
2428 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2429 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
2430 {
2431   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2432   unsigned short *aj=(unsigned short *)a->j;
2433   PetscErrorCode ierr;
2434   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2435   MatScalar      *aa=a->a;
2436   PetscScalar    *x,*b;
2437 
2438   PetscFunctionBegin;
2439   SSE_SCOPE_BEGIN;
2440   /*
2441      Note: This code currently uses demotion of double
2442      to float when performing the mixed-mode computation.
2443      This may not be numerically reasonable for all applications.
2444   */
2445   PREFETCH_NTA(aa+16*ai[1]);
2446 
2447   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2448   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2449   {
2450     /* x will first be computed in single precision then promoted inplace to double */
2451     MatScalar      *v,*t=(MatScalar *)x;
2452     int            nz,i,idt,ai16;
2453     unsigned int   jdx,idx;
2454     unsigned short *vi;
2455     /* Forward solve the lower triangular factor. */
2456 
2457     /* First block is the identity. */
2458     idx  = 0;
2459     CONVERT_DOUBLE4_FLOAT4(t,b);
2460     v    =  aa + 16*((unsigned int)ai[1]);
2461 
2462     for (i=1; i<n;) {
2463       PREFETCH_NTA(&v[8]);
2464       vi   =  aj      + ai[i];
2465       nz   =  diag[i] - ai[i];
2466       idx +=  4;
2467 
2468       /* Demote RHS from double to float. */
2469       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2470       LOAD_PS(&t[idx],XMM7);
2471 
2472       while (nz--) {
2473         PREFETCH_NTA(&v[16]);
2474         jdx = 4*((unsigned int)(*vi++));
2475 
2476         /* 4x4 Matrix-Vector product with negative accumulation: */
2477         SSE_INLINE_BEGIN_2(&t[jdx],v)
2478           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2479 
2480           /* First Column */
2481           SSE_COPY_PS(XMM0,XMM6)
2482           SSE_SHUFFLE(XMM0,XMM0,0x00)
2483           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2484           SSE_SUB_PS(XMM7,XMM0)
2485 
2486           /* Second Column */
2487           SSE_COPY_PS(XMM1,XMM6)
2488           SSE_SHUFFLE(XMM1,XMM1,0x55)
2489           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2490           SSE_SUB_PS(XMM7,XMM1)
2491 
2492           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2493 
2494           /* Third Column */
2495           SSE_COPY_PS(XMM2,XMM6)
2496           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2497           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2498           SSE_SUB_PS(XMM7,XMM2)
2499 
2500           /* Fourth Column */
2501           SSE_COPY_PS(XMM3,XMM6)
2502           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2503           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2504           SSE_SUB_PS(XMM7,XMM3)
2505         SSE_INLINE_END_2
2506 
2507         v  += 16;
2508       }
2509       v    =  aa + 16*ai[++i];
2510       PREFETCH_NTA(v);
2511       STORE_PS(&t[idx],XMM7);
2512     }
2513 
2514     /* Backward solve the upper triangular factor.*/
2515 
2516     idt  = 4*(n-1);
2517     ai16 = 16*diag[n-1];
2518     v    = aa + ai16 + 16;
2519     for (i=n-1; i>=0;){
2520       PREFETCH_NTA(&v[8]);
2521       vi = aj + diag[i] + 1;
2522       nz = ai[i+1] - diag[i] - 1;
2523 
2524       LOAD_PS(&t[idt],XMM7);
2525 
2526       while (nz--) {
2527         PREFETCH_NTA(&v[16]);
2528         idx = 4*((unsigned int)(*vi++));
2529 
2530         /* 4x4 Matrix-Vector Product with negative accumulation: */
2531         SSE_INLINE_BEGIN_2(&t[idx],v)
2532           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2533 
2534           /* First Column */
2535           SSE_COPY_PS(XMM0,XMM6)
2536           SSE_SHUFFLE(XMM0,XMM0,0x00)
2537           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2538           SSE_SUB_PS(XMM7,XMM0)
2539 
2540           /* Second Column */
2541           SSE_COPY_PS(XMM1,XMM6)
2542           SSE_SHUFFLE(XMM1,XMM1,0x55)
2543           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2544           SSE_SUB_PS(XMM7,XMM1)
2545 
2546           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2547 
2548           /* Third Column */
2549           SSE_COPY_PS(XMM2,XMM6)
2550           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2551           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2552           SSE_SUB_PS(XMM7,XMM2)
2553 
2554           /* Fourth Column */
2555           SSE_COPY_PS(XMM3,XMM6)
2556           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2557           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2558           SSE_SUB_PS(XMM7,XMM3)
2559         SSE_INLINE_END_2
2560         v  += 16;
2561       }
2562       v    = aa + ai16;
2563       ai16 = 16*diag[--i];
2564       PREFETCH_NTA(aa+ai16+16);
2565       /*
2566          Scale the result by the diagonal 4x4 block,
2567          which was inverted as part of the factorization
2568       */
2569       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2570         /* First Column */
2571         SSE_COPY_PS(XMM0,XMM7)
2572         SSE_SHUFFLE(XMM0,XMM0,0x00)
2573         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2574 
2575         /* Second Column */
2576         SSE_COPY_PS(XMM1,XMM7)
2577         SSE_SHUFFLE(XMM1,XMM1,0x55)
2578         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2579         SSE_ADD_PS(XMM0,XMM1)
2580 
2581         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2582 
2583         /* Third Column */
2584         SSE_COPY_PS(XMM2,XMM7)
2585         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2586         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2587         SSE_ADD_PS(XMM0,XMM2)
2588 
2589         /* Fourth Column */
2590         SSE_COPY_PS(XMM3,XMM7)
2591         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2592         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2593         SSE_ADD_PS(XMM0,XMM3)
2594 
2595         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2596       SSE_INLINE_END_3
2597 
2598       v    = aa + ai16 + 16;
2599       idt -= 4;
2600     }
2601 
2602     /* Convert t from single precision back to double precision (inplace)*/
2603     idt = 4*(n-1);
2604     for (i=n-1;i>=0;i--) {
2605       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2606       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2607       PetscScalar *xtemp=&x[idt];
2608       MatScalar   *ttemp=&t[idt];
2609       xtemp[3] = (PetscScalar)ttemp[3];
2610       xtemp[2] = (PetscScalar)ttemp[2];
2611       xtemp[1] = (PetscScalar)ttemp[1];
2612       xtemp[0] = (PetscScalar)ttemp[0];
2613       idt -= 4;
2614     }
2615 
2616   } /* End of artificial scope. */
2617   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2618   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2619   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2620   SSE_SCOPE_END;
2621   PetscFunctionReturn(0);
2622 }
2623 
2624 #undef __FUNCT__
2625 #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2626 PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
2627 {
2628   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2629   int            *aj=a->j;
2630   PetscErrorCode ierr;
2631   int            *ai=a->i,n=a->mbs,*diag = a->diag;
2632   MatScalar      *aa=a->a;
2633   PetscScalar    *x,*b;
2634 
2635   PetscFunctionBegin;
2636   SSE_SCOPE_BEGIN;
2637   /*
2638      Note: This code currently uses demotion of double
2639      to float when performing the mixed-mode computation.
2640      This may not be numerically reasonable for all applications.
2641   */
2642   PREFETCH_NTA(aa+16*ai[1]);
2643 
2644   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2645   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2646   {
2647     /* x will first be computed in single precision then promoted inplace to double */
2648     MatScalar *v,*t=(MatScalar *)x;
2649     int       nz,i,idt,ai16;
2650     int       jdx,idx;
2651     int       *vi;
2652     /* Forward solve the lower triangular factor. */
2653 
2654     /* First block is the identity. */
2655     idx  = 0;
2656     CONVERT_DOUBLE4_FLOAT4(t,b);
2657     v    =  aa + 16*ai[1];
2658 
2659     for (i=1; i<n;) {
2660       PREFETCH_NTA(&v[8]);
2661       vi   =  aj      + ai[i];
2662       nz   =  diag[i] - ai[i];
2663       idx +=  4;
2664 
2665       /* Demote RHS from double to float. */
2666       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2667       LOAD_PS(&t[idx],XMM7);
2668 
2669       while (nz--) {
2670         PREFETCH_NTA(&v[16]);
2671         jdx = 4*(*vi++);
2672 /*          jdx = *vi++; */
2673 
2674         /* 4x4 Matrix-Vector product with negative accumulation: */
2675         SSE_INLINE_BEGIN_2(&t[jdx],v)
2676           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2677 
2678           /* First Column */
2679           SSE_COPY_PS(XMM0,XMM6)
2680           SSE_SHUFFLE(XMM0,XMM0,0x00)
2681           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2682           SSE_SUB_PS(XMM7,XMM0)
2683 
2684           /* Second Column */
2685           SSE_COPY_PS(XMM1,XMM6)
2686           SSE_SHUFFLE(XMM1,XMM1,0x55)
2687           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2688           SSE_SUB_PS(XMM7,XMM1)
2689 
2690           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2691 
2692           /* Third Column */
2693           SSE_COPY_PS(XMM2,XMM6)
2694           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2695           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2696           SSE_SUB_PS(XMM7,XMM2)
2697 
2698           /* Fourth Column */
2699           SSE_COPY_PS(XMM3,XMM6)
2700           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2701           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2702           SSE_SUB_PS(XMM7,XMM3)
2703         SSE_INLINE_END_2
2704 
2705         v  += 16;
2706       }
2707       v    =  aa + 16*ai[++i];
2708       PREFETCH_NTA(v);
2709       STORE_PS(&t[idx],XMM7);
2710     }
2711 
2712     /* Backward solve the upper triangular factor.*/
2713 
2714     idt  = 4*(n-1);
2715     ai16 = 16*diag[n-1];
2716     v    = aa + ai16 + 16;
2717     for (i=n-1; i>=0;){
2718       PREFETCH_NTA(&v[8]);
2719       vi = aj + diag[i] + 1;
2720       nz = ai[i+1] - diag[i] - 1;
2721 
2722       LOAD_PS(&t[idt],XMM7);
2723 
2724       while (nz--) {
2725         PREFETCH_NTA(&v[16]);
2726         idx = 4*(*vi++);
2727 /*          idx = *vi++; */
2728 
2729         /* 4x4 Matrix-Vector Product with negative accumulation: */
2730         SSE_INLINE_BEGIN_2(&t[idx],v)
2731           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
2732 
2733           /* First Column */
2734           SSE_COPY_PS(XMM0,XMM6)
2735           SSE_SHUFFLE(XMM0,XMM0,0x00)
2736           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
2737           SSE_SUB_PS(XMM7,XMM0)
2738 
2739           /* Second Column */
2740           SSE_COPY_PS(XMM1,XMM6)
2741           SSE_SHUFFLE(XMM1,XMM1,0x55)
2742           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
2743           SSE_SUB_PS(XMM7,XMM1)
2744 
2745           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
2746 
2747           /* Third Column */
2748           SSE_COPY_PS(XMM2,XMM6)
2749           SSE_SHUFFLE(XMM2,XMM2,0xAA)
2750           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
2751           SSE_SUB_PS(XMM7,XMM2)
2752 
2753           /* Fourth Column */
2754           SSE_COPY_PS(XMM3,XMM6)
2755           SSE_SHUFFLE(XMM3,XMM3,0xFF)
2756           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
2757           SSE_SUB_PS(XMM7,XMM3)
2758         SSE_INLINE_END_2
2759         v  += 16;
2760       }
2761       v    = aa + ai16;
2762       ai16 = 16*diag[--i];
2763       PREFETCH_NTA(aa+ai16+16);
2764       /*
2765          Scale the result by the diagonal 4x4 block,
2766          which was inverted as part of the factorization
2767       */
2768       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
2769         /* First Column */
2770         SSE_COPY_PS(XMM0,XMM7)
2771         SSE_SHUFFLE(XMM0,XMM0,0x00)
2772         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
2773 
2774         /* Second Column */
2775         SSE_COPY_PS(XMM1,XMM7)
2776         SSE_SHUFFLE(XMM1,XMM1,0x55)
2777         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
2778         SSE_ADD_PS(XMM0,XMM1)
2779 
2780         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
2781 
2782         /* Third Column */
2783         SSE_COPY_PS(XMM2,XMM7)
2784         SSE_SHUFFLE(XMM2,XMM2,0xAA)
2785         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
2786         SSE_ADD_PS(XMM0,XMM2)
2787 
2788         /* Fourth Column */
2789         SSE_COPY_PS(XMM3,XMM7)
2790         SSE_SHUFFLE(XMM3,XMM3,0xFF)
2791         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
2792         SSE_ADD_PS(XMM0,XMM3)
2793 
2794         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
2795       SSE_INLINE_END_3
2796 
2797       v    = aa + ai16 + 16;
2798       idt -= 4;
2799     }
2800 
2801     /* Convert t from single precision back to double precision (inplace)*/
2802     idt = 4*(n-1);
2803     for (i=n-1;i>=0;i--) {
2804       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2805       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2806       PetscScalar *xtemp=&x[idt];
2807       MatScalar   *ttemp=&t[idt];
2808       xtemp[3] = (PetscScalar)ttemp[3];
2809       xtemp[2] = (PetscScalar)ttemp[2];
2810       xtemp[1] = (PetscScalar)ttemp[1];
2811       xtemp[0] = (PetscScalar)ttemp[0];
2812       idt -= 4;
2813     }
2814 
2815   } /* End of artificial scope. */
2816   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2817   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2818   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2819   SSE_SCOPE_END;
2820   PetscFunctionReturn(0);
2821 }
2822 
2823 #endif
2824 #undef __FUNCT__
2825 #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2826 PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
2827 {
2828   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2829   IS                iscol=a->col,isrow=a->row;
2830   PetscErrorCode    ierr;
2831   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2832   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2833   const MatScalar   *aa=a->a,*v;
2834   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2835   const PetscScalar *b;
2836 
2837   PetscFunctionBegin;
2838   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2839   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2840   t  = a->solve_work;
2841 
2842   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2843   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2844 
2845   /* forward solve the lower triangular */
2846   idx    = 3*(*r++);
2847   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
2848   for (i=1; i<n; i++) {
2849     v     = aa + 9*ai[i];
2850     vi    = aj + ai[i];
2851     nz    = diag[i] - ai[i];
2852     idx   = 3*(*r++);
2853     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
2854     while (nz--) {
2855       idx   = 3*(*vi++);
2856       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2857       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2858       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2859       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2860       v += 9;
2861     }
2862     idx = 3*i;
2863     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
2864   }
2865   /* backward solve the upper triangular */
2866   for (i=n-1; i>=0; i--){
2867     v    = aa + 9*diag[i] + 9;
2868     vi   = aj + diag[i] + 1;
2869     nz   = ai[i+1] - diag[i] - 1;
2870     idt  = 3*i;
2871     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
2872     while (nz--) {
2873       idx   = 3*(*vi++);
2874       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2875       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2876       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2877       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2878       v += 9;
2879     }
2880     idc = 3*(*c--);
2881     v   = aa + 9*diag[i];
2882     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2883     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2884     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2885   }
2886   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2887   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2888   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2889   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2890   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2891   PetscFunctionReturn(0);
2892 }
2893 
2894 /*
2895       Special case where the matrix was ILU(0) factored in the natural
2896    ordering. This eliminates the need for the column and row permutation.
2897 */
2898 #undef __FUNCT__
2899 #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2900 PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2901 {
2902   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2904   PetscErrorCode    ierr;
2905   PetscInt          *diag = a->diag;
2906   const MatScalar   *aa=a->a,*v;
2907   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2908   const PetscScalar *b;
2909   PetscInt          jdx,idt,idx,nz,*vi,i;
2910 
2911   PetscFunctionBegin;
2912   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2913   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2914 
2915   /* forward solve the lower triangular */
2916   idx    = 0;
2917   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
2918   for (i=1; i<n; i++) {
2919     v     =  aa      + 9*ai[i];
2920     vi    =  aj      + ai[i];
2921     nz    =  diag[i] - ai[i];
2922     idx   +=  3;
2923     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
2924     while (nz--) {
2925       jdx   = 3*(*vi++);
2926       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2927       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2928       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2929       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2930       v    += 9;
2931     }
2932     x[idx]   = s1;
2933     x[1+idx] = s2;
2934     x[2+idx] = s3;
2935   }
2936   /* backward solve the upper triangular */
2937   for (i=n-1; i>=0; i--){
2938     v    = aa + 9*diag[i] + 9;
2939     vi   = aj + diag[i] + 1;
2940     nz   = ai[i+1] - diag[i] - 1;
2941     idt  = 3*i;
2942     s1 = x[idt];  s2 = x[1+idt];
2943     s3 = x[2+idt];
2944     while (nz--) {
2945       idx   = 3*(*vi++);
2946       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2947       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2948       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2949       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
2950       v    += 9;
2951     }
2952     v        = aa +  9*diag[i];
2953     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2954     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2955     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
2956   }
2957 
2958   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2959   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2960   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
2961   PetscFunctionReturn(0);
2962 }
2963 
2964 #undef __FUNCT__
2965 #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2966 PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
2967 {
2968   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2969   IS                iscol=a->col,isrow=a->row;
2970   PetscErrorCode    ierr;
2971   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2972   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2973   const MatScalar   *aa=a->a,*v;
2974   PetscScalar       *x,s1,s2,x1,x2,*t;
2975   const PetscScalar *b;
2976 
2977   PetscFunctionBegin;
2978   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2979   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2980   t  = a->solve_work;
2981 
2982   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2983   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2984 
2985   /* forward solve the lower triangular */
2986   idx    = 2*(*r++);
2987   t[0] = b[idx]; t[1] = b[1+idx];
2988   for (i=1; i<n; i++) {
2989     v     = aa + 4*ai[i];
2990     vi    = aj + ai[i];
2991     nz    = diag[i] - ai[i];
2992     idx   = 2*(*r++);
2993     s1  = b[idx]; s2 = b[1+idx];
2994     while (nz--) {
2995       idx   = 2*(*vi++);
2996       x1    = t[idx]; x2 = t[1+idx];
2997       s1 -= v[0]*x1 + v[2]*x2;
2998       s2 -= v[1]*x1 + v[3]*x2;
2999       v += 4;
3000     }
3001     idx = 2*i;
3002     t[idx] = s1; t[1+idx] = s2;
3003   }
3004   /* backward solve the upper triangular */
3005   for (i=n-1; i>=0; i--){
3006     v    = aa + 4*diag[i] + 4;
3007     vi   = aj + diag[i] + 1;
3008     nz   = ai[i+1] - diag[i] - 1;
3009     idt  = 2*i;
3010     s1 = t[idt]; s2 = t[1+idt];
3011     while (nz--) {
3012       idx   = 2*(*vi++);
3013       x1    = t[idx]; x2 = t[1+idx];
3014       s1 -= v[0]*x1 + v[2]*x2;
3015       s2 -= v[1]*x1 + v[3]*x2;
3016       v += 4;
3017     }
3018     idc = 2*(*c--);
3019     v   = aa + 4*diag[i];
3020     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3021     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
3022   }
3023   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3024   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3025   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3026   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3027   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3028   PetscFunctionReturn(0);
3029 }
3030 
3031 /*
3032       Special case where the matrix was ILU(0) factored in the natural
3033    ordering. This eliminates the need for the column and row permutation.
3034 */
3035 #undef __FUNCT__
3036 #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3037 PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
3038 {
3039   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3040   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3041   PetscErrorCode    ierr;
3042   PetscInt          *diag = a->diag;
3043   const MatScalar   *aa=a->a,*v;
3044   PetscScalar       *x,s1,s2,x1,x2;
3045   const PetscScalar *b;
3046   PetscInt          jdx,idt,idx,nz,*vi,i;
3047 
3048   PetscFunctionBegin;
3049   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3050   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3051 
3052   /* forward solve the lower triangular */
3053   idx    = 0;
3054   x[0]   = b[0]; x[1] = b[1];
3055   for (i=1; i<n; i++) {
3056     v     =  aa      + 4*ai[i];
3057     vi    =  aj      + ai[i];
3058     nz    =  diag[i] - ai[i];
3059     idx   +=  2;
3060     s1  =  b[idx];s2 = b[1+idx];
3061     while (nz--) {
3062       jdx   = 2*(*vi++);
3063       x1    = x[jdx];x2 = x[1+jdx];
3064       s1 -= v[0]*x1 + v[2]*x2;
3065       s2 -= v[1]*x1 + v[3]*x2;
3066       v    += 4;
3067     }
3068     x[idx]   = s1;
3069     x[1+idx] = s2;
3070   }
3071   /* backward solve the upper triangular */
3072   for (i=n-1; i>=0; i--){
3073     v    = aa + 4*diag[i] + 4;
3074     vi   = aj + diag[i] + 1;
3075     nz   = ai[i+1] - diag[i] - 1;
3076     idt  = 2*i;
3077     s1 = x[idt];  s2 = x[1+idt];
3078     while (nz--) {
3079       idx   = 2*(*vi++);
3080       x1    = x[idx];   x2 = x[1+idx];
3081       s1 -= v[0]*x1 + v[2]*x2;
3082       s2 -= v[1]*x1 + v[3]*x2;
3083       v    += 4;
3084     }
3085     v        = aa +  4*diag[i];
3086     x[idt]   = v[0]*s1 + v[2]*s2;
3087     x[1+idt] = v[1]*s1 + v[3]*s2;
3088   }
3089 
3090   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3091   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3092   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3093   PetscFunctionReturn(0);
3094 }
3095 
3096 #undef __FUNCT__
3097 #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3098 PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
3099 {
3100   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3101   IS             iscol=a->col,isrow=a->row;
3102   PetscErrorCode ierr;
3103   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3104   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
3105   MatScalar      *aa=a->a,*v;
3106   PetscScalar    *x,*b,s1,*t;
3107 
3108   PetscFunctionBegin;
3109   if (!n) PetscFunctionReturn(0);
3110 
3111   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3112   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3113   t  = a->solve_work;
3114 
3115   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3116   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3117 
3118   /* forward solve the lower triangular */
3119   t[0] = b[*r++];
3120   for (i=1; i<n; i++) {
3121     v     = aa + ai[i];
3122     vi    = aj + ai[i];
3123     nz    = diag[i] - ai[i];
3124     s1  = b[*r++];
3125     while (nz--) {
3126       s1 -= (*v++)*t[*vi++];
3127     }
3128     t[i] = s1;
3129   }
3130   /* backward solve the upper triangular */
3131   for (i=n-1; i>=0; i--){
3132     v    = aa + diag[i] + 1;
3133     vi   = aj + diag[i] + 1;
3134     nz   = ai[i+1] - diag[i] - 1;
3135     s1 = t[i];
3136     while (nz--) {
3137       s1 -= (*v++)*t[*vi++];
3138     }
3139     x[*c--] = t[i] = aa[diag[i]]*s1;
3140   }
3141 
3142   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3143   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3144   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3145   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3146   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3147   PetscFunctionReturn(0);
3148 }
3149 /*
3150       Special case where the matrix was ILU(0) factored in the natural
3151    ordering. This eliminates the need for the column and row permutation.
3152 */
3153 #undef __FUNCT__
3154 #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3155 PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
3156 {
3157   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3158   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3159   PetscErrorCode ierr;
3160   PetscInt       *diag = a->diag;
3161   MatScalar      *aa=a->a;
3162   PetscScalar    *x,*b;
3163   PetscScalar    s1,x1;
3164   MatScalar      *v;
3165   PetscInt       jdx,idt,idx,nz,*vi,i;
3166 
3167   PetscFunctionBegin;
3168   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3169   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3170 
3171   /* forward solve the lower triangular */
3172   idx    = 0;
3173   x[0]   = b[0];
3174   for (i=1; i<n; i++) {
3175     v     =  aa      + ai[i];
3176     vi    =  aj      + ai[i];
3177     nz    =  diag[i] - ai[i];
3178     idx   +=  1;
3179     s1  =  b[idx];
3180     while (nz--) {
3181       jdx   = *vi++;
3182       x1    = x[jdx];
3183       s1 -= v[0]*x1;
3184       v    += 1;
3185     }
3186     x[idx]   = s1;
3187   }
3188   /* backward solve the upper triangular */
3189   for (i=n-1; i>=0; i--){
3190     v    = aa + diag[i] + 1;
3191     vi   = aj + diag[i] + 1;
3192     nz   = ai[i+1] - diag[i] - 1;
3193     idt  = i;
3194     s1 = x[idt];
3195     while (nz--) {
3196       idx   = *vi++;
3197       x1    = x[idx];
3198       s1 -= v[0]*x1;
3199       v    += 1;
3200     }
3201     v        = aa +  diag[i];
3202     x[idt]   = v[0]*s1;
3203   }
3204   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3205   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3206   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
3207   PetscFunctionReturn(0);
3208 }
3209 
3210 /* ----------------------------------------------------------------*/
3211 EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3212 EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3213 
3214 extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
3215 #undef __FUNCT__
3216 #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
3217 PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
3218 {
3219   Mat            C=B;
3220   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
3221   IS             isrow = b->row,isicol = b->icol;
3222   PetscErrorCode ierr;
3223   const PetscInt *r,*ic,*ics;
3224   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
3225   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3226   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3227   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3228   MatScalar      *v_work;
3229 
3230   PetscFunctionBegin;
3231   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3232   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3233   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3234   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
3235   ics  = ic;
3236 
3237   /* generate work space needed by dense LU factorization */
3238   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3239   multiplier = v_work + bs;
3240   v_pivots   = (PetscInt*)(multiplier + bs2);
3241 
3242   for (i=0; i<n; i++){
3243     /* zero rtmp */
3244     /* L part */
3245     nz    = bi[i+1] - bi[i];
3246     bjtmp = bj + bi[i];
3247     for  (j=0; j<nz; j++){
3248       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3249     }
3250 
3251     /* U part */
3252     nz = bi[2*n-i+1] - bi[2*n-i];
3253     bjtmp = bj + bi[2*n-i];
3254     for  (j=0; j<nz; j++){
3255       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3256     }
3257 
3258     /* load in initial (unfactored row) */
3259     nz    = ai[r[i]+1] - ai[r[i]];
3260     ajtmp = aj + ai[r[i]];
3261     v     = aa + bs2*ai[r[i]];
3262     for (j=0; j<nz; j++) {
3263       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
3264     }
3265 
3266     /* elimination */
3267     bjtmp = bj + bi[i];
3268     row   = *bjtmp++;
3269     nzL   = bi[i+1] - bi[i];
3270     k   = 0;
3271     while  (k < nzL) {
3272       pc = rtmp + bs2*row;
3273       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3274       if (flg) {
3275         pv         = b->a + bs2*bdiag[row];
3276         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
3277         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3278         pv         = b->a + bs2*bi[2*n-row];
3279         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3280         for (j=0; j<nz; j++) {
3281           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3282         }
3283         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
3284       }
3285       row = *bjtmp++; k++;
3286     }
3287 
3288     /* finished row so stick it into b->a */
3289     /* L part */
3290     pv   = b->a + bs2*bi[i] ;
3291     pj   = b->j + bi[i] ;
3292     nz   = bi[i+1] - bi[i];
3293     for (j=0; j<nz; j++) {
3294       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3295     }
3296 
3297     /* Mark diagonal and invert diagonal for simplier triangular solves */
3298     pv  = b->a + bs2*bdiag[i];
3299     pj  = b->j + bdiag[i];
3300     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3301     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3302     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
3303 
3304     /* U part */
3305     pv = b->a + bs2*bi[2*n-i];
3306     pj = b->j + bi[2*n-i];
3307     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3308     for (j=0; j<nz; j++){
3309       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3310     }
3311   }
3312 
3313   ierr = PetscFree(rtmp);CHKERRQ(ierr);
3314   ierr = PetscFree(v_work);CHKERRQ(ierr);
3315   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3316   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3317 
3318   switch (A->rmap->bs){
3319   case 2:
3320     C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3321     break;
3322 
3323   case 5:
3324     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3325     break;
3326   default:
3327     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3328     break;
3329   }
3330   C->assembled = PETSC_TRUE;
3331   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
3332   PetscFunctionReturn(0);
3333 }
3334 
3335 /*
3336    ilu(0) with natural ordering under new data structure.
3337    Factored arrays bj and ba are stored as
3338      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
3339 
3340    bi=fact->i is an array of size 2n+2, in which
3341    bi+
3342      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
3343      bi[n]      ->  end of L(n-1,:)+1
3344      bi[n+1]    ->  1st entry of U(n-1,:)
3345      bi[2n-i]   ->  1st entry of U(i,:)
3346      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
3347      bi[2n]     ->  end of U(0,:)+1
3348 
3349    U(i,:) contains diag[i] as its last entry, i.e.,
3350     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
3351 */
3352 #undef __FUNCT__
3353 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
3354 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3355 {
3356 
3357   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
3358   PetscErrorCode     ierr;
3359   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
3360   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
3361 
3362   PetscFunctionBegin;
3363   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3364   b     = (Mat_SeqBAIJ*)(fact)->data;
3365   bdiag = b->diag;
3366 
3367   /* replace matrix arrays with single allocations, then reset values */
3368   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
3369 
3370   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
3371   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
3372   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
3373   b->singlemalloc = PETSC_FALSE;
3374   if (mbs > 0) {
3375     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
3376   }
3377 
3378   /* set bi and bj with new data structure */
3379   bi = b->i;
3380   bj = b->j;
3381 
3382   /* L part */
3383   bi[0] = 0;
3384   for (i=0; i<mbs; i++){
3385     nz = adiag[i] - ai[i];
3386     bi[i+1] = bi[i] + nz;
3387     aj = a->j + ai[i];
3388     for (j=0; j<nz; j++){
3389       *bj = aj[j]; bj++;
3390     }
3391   }
3392 
3393   /* U part */
3394   bi[mbs+1] = bi[mbs];
3395   for (i=mbs-1; i>=0; i--){
3396     nz = ai[i+1] - adiag[i] - 1;
3397     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3398     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
3399     aj = a->j + adiag[i] + 1;
3400     for (j=0; j<nz; j++){
3401       *bj = aj[j]; bj++;
3402     }
3403     /* diag[i] */
3404     *bj = i; bj++;
3405     bdiag[i] = bi[2*mbs-i+1]-1;
3406   }
3407   PetscFunctionReturn(0);
3408 }
3409 
3410 /*
3411      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
3412    except that the data structure of Mat_SeqAIJ is slightly different.
3413    Not a good example of code reuse.
3414 */
3415 
3416 #undef __FUNCT__
3417 #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3418 PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
3419 {
3420   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
3421   IS             isicol;
3422   PetscErrorCode ierr;
3423   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3424   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3425   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3426   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
3427   PetscTruth     col_identity,row_identity,both_identity,flg;
3428   PetscReal      f;
3429 
3430   PetscFunctionBegin;
3431   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
3432   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
3433 
3434   f             = info->fill;
3435   levels        = (PetscInt)info->levels;
3436   diagonal_fill = (PetscInt)info->diagonal_fill;
3437   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3438   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3439   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3440   both_identity = (PetscTruth) (row_identity && col_identity);
3441 
3442   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3443 
3444     PetscTruth newdatastruct=PETSC_FALSE;
3445     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
3446     if (newdatastruct){
3447       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
3448       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3449     } else {
3450       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3451       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3452     }
3453 
3454     fact->factor = MAT_FACTOR_ILU;
3455     b            = (Mat_SeqBAIJ*)(fact)->data;
3456     b->row       = isrow;
3457     b->col       = iscol;
3458     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3459     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3460     b->icol      = isicol;
3461     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3462     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3463     PetscFunctionReturn(0);
3464   }
3465 
3466   /* general case perform the symbolic factorization */
3467     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
3468     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3469 
3470     /* get new row pointers */
3471     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
3472     ainew[0] = 0;
3473     /* don't know how many column pointers are needed so estimate */
3474     jmax = (PetscInt)(f*ai[n] + 1);
3475     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
3476     /* ajfill is level of fill for each fill entry */
3477     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
3478     /* fill is a linked list of nonzeros in active row */
3479     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
3480     /* im is level for each filled value */
3481     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
3482     /* dloc is location of diagonal in factor */
3483     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
3484     dloc[0]  = 0;
3485     for (prow=0; prow<n; prow++) {
3486 
3487       /* copy prow into linked list */
3488       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
3489       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
3490       xi         = aj + ai[r[prow]];
3491       fill[n]    = n;
3492       fill[prow] = -1; /* marker for diagonal entry */
3493       while (nz--) {
3494 	fm  = n;
3495 	idx = ic[*xi++];
3496 	do {
3497 	  m  = fm;
3498 	  fm = fill[m];
3499 	} while (fm < idx);
3500 	fill[m]   = idx;
3501 	fill[idx] = fm;
3502 	im[idx]   = 0;
3503       }
3504 
3505       /* make sure diagonal entry is included */
3506       if (diagonal_fill && fill[prow] == -1) {
3507 	fm = n;
3508 	while (fill[fm] < prow) fm = fill[fm];
3509 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3510 	fill[fm]   = prow;
3511 	im[prow]   = 0;
3512 	nzf++;
3513 	dcount++;
3514       }
3515 
3516       nzi = 0;
3517       row = fill[n];
3518       while (row < prow) {
3519 	incrlev = im[row] + 1;
3520 	nz      = dloc[row];
3521 	xi      = ajnew  + ainew[row] + nz + 1;
3522 	flev    = ajfill + ainew[row] + nz + 1;
3523 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
3524 	fm      = row;
3525 	while (nnz-- > 0) {
3526 	  idx = *xi++;
3527 	  if (*flev + incrlev > levels) {
3528 	    flev++;
3529 	    continue;
3530 	  }
3531 	  do {
3532 	    m  = fm;
3533 	    fm = fill[m];
3534 	  } while (fm < idx);
3535 	  if (fm != idx) {
3536 	    im[idx]   = *flev + incrlev;
3537 	    fill[m]   = idx;
3538 	    fill[idx] = fm;
3539 	    fm        = idx;
3540 	    nzf++;
3541 	  } else {
3542 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
3543 	  }
3544 	  flev++;
3545 	}
3546 	row = fill[row];
3547 	nzi++;
3548       }
3549       /* copy new filled row into permanent storage */
3550       ainew[prow+1] = ainew[prow] + nzf;
3551       if (ainew[prow+1] > jmax) {
3552 
3553 	/* estimate how much additional space we will need */
3554 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3555 	/* just double the memory each time */
3556 	PetscInt maxadd = jmax;
3557 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
3558 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
3559 	jmax += maxadd;
3560 
3561 	/* allocate a longer ajnew and ajfill */
3562 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3563 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3564 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3565 	ajnew = xitmp;
3566 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3567 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3568 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3569 	ajfill = xitmp;
3570 	reallocate++; /* count how many reallocations are needed */
3571       }
3572       xitmp       = ajnew + ainew[prow];
3573       flev        = ajfill + ainew[prow];
3574       dloc[prow]  = nzi;
3575       fm          = fill[n];
3576       while (nzf--) {
3577 	*xitmp++ = fm;
3578 	*flev++ = im[fm];
3579 	fm      = fill[fm];
3580       }
3581       /* make sure row has diagonal entry */
3582       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
3583 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
3584     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3585       }
3586     }
3587     ierr = PetscFree(ajfill);CHKERRQ(ierr);
3588     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3589     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3590     ierr = PetscFree(fill);CHKERRQ(ierr);
3591     ierr = PetscFree(im);CHKERRQ(ierr);
3592 
3593 #if defined(PETSC_USE_INFO)
3594     {
3595       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3596       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3597       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3598       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3599       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3600       if (diagonal_fill) {
3601 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3602       }
3603     }
3604 #endif
3605 
3606     /* put together the new matrix */
3607     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3608     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3609     b    = (Mat_SeqBAIJ*)(fact)->data;
3610     b->free_a       = PETSC_TRUE;
3611     b->free_ij      = PETSC_TRUE;
3612     b->singlemalloc = PETSC_FALSE;
3613     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
3614     b->j          = ajnew;
3615     b->i          = ainew;
3616     for (i=0; i<n; i++) dloc[i] += ainew[i];
3617     b->diag       = dloc;
3618     b->ilen       = 0;
3619     b->imax       = 0;
3620     b->row        = isrow;
3621     b->col        = iscol;
3622     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3623     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3624     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3625     b->icol       = isicol;
3626     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3627     /* In b structure:  Free imax, ilen, old a, old j.
3628        Allocate dloc, solve_work, new a, new j */
3629     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
3630     b->maxnz          = b->nz = ainew[n];
3631 
3632     (fact)->info.factor_mallocs    = reallocate;
3633     (fact)->info.fill_ratio_given  = f;
3634     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3635 
3636   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
3637   PetscFunctionReturn(0);
3638 }
3639 
3640 #undef __FUNCT__
3641 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3642 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
3643 {
3644   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
3645   /* int i,*AJ=a->j,nz=a->nz; */
3646   PetscFunctionBegin;
3647   /* Undo Column scaling */
3648 /*    while (nz--) { */
3649 /*      AJ[i] = AJ[i]/4; */
3650 /*    } */
3651   /* This should really invoke a push/pop logic, but we don't have that yet. */
3652   A->ops->setunfactored = PETSC_NULL;
3653   PetscFunctionReturn(0);
3654 }
3655 
3656 #undef __FUNCT__
3657 #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3658 PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
3659 {
3660   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3661   PetscInt       *AJ=a->j,nz=a->nz;
3662   unsigned short *aj=(unsigned short *)AJ;
3663   PetscFunctionBegin;
3664   /* Is this really necessary? */
3665   while (nz--) {
3666     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
3667   }
3668   A->ops->setunfactored = PETSC_NULL;
3669   PetscFunctionReturn(0);
3670 }
3671 
3672 
3673