xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ccaa8a1b5642edce604956fbfb21a097c8dd4f5b)
173f4d377SMatthew Knepley /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
74e2b4712SSatish Balay #include "src/vec/vecimpl.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17f1af5d2fSBarry Smith   int             *diag = a->diag;
18f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
1987828ca2SBarry Smith   PetscScalar     s1,*x,*b;
20f1af5d2fSBarry Smith 
21f1af5d2fSBarry Smith   PetscFunctionBegin;
22ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25f1af5d2fSBarry Smith 
26f1af5d2fSBarry Smith   /* forward solve the U^T */
27f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith     v     = aa + diag[i];
30f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
31ef66eb69SBarry Smith     s1    = (*v++)*x[i];
32f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
33f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
34f1af5d2fSBarry Smith     while (nz--) {
35f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
36f1af5d2fSBarry Smith     }
37f1af5d2fSBarry Smith     x[i]   = s1;
38f1af5d2fSBarry Smith   }
39f1af5d2fSBarry Smith   /* backward solve the L^T */
40f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
41f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
42f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
43f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
44f1af5d2fSBarry Smith     s1   = x[i];
45f1af5d2fSBarry Smith     while (nz--) {
46f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
47f1af5d2fSBarry Smith     }
48f1af5d2fSBarry Smith   }
49f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
52f1af5d2fSBarry Smith   PetscFunctionReturn(0);
53f1af5d2fSBarry Smith }
54f1af5d2fSBarry Smith 
554a2ae208SSatish Balay #undef __FUNCT__
564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58f1af5d2fSBarry Smith {
59f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
62f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
6387828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
6487828ca2SBarry Smith   PetscScalar     *x,*b;
65f1af5d2fSBarry Smith 
66f1af5d2fSBarry Smith   PetscFunctionBegin;
67ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70f1af5d2fSBarry Smith 
71f1af5d2fSBarry Smith   /* forward solve the U^T */
72f1af5d2fSBarry Smith   idx = 0;
73f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
76f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
77ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
78f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
79f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
80f1af5d2fSBarry Smith     v += 4;
81f1af5d2fSBarry Smith 
82f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
83f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
84f1af5d2fSBarry Smith     while (nz--) {
85f1af5d2fSBarry Smith       oidx = 2*(*vi++);
86f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88f1af5d2fSBarry Smith       v  += 4;
89f1af5d2fSBarry Smith     }
90f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
91f1af5d2fSBarry Smith     idx += 2;
92f1af5d2fSBarry Smith   }
93f1af5d2fSBarry Smith   /* backward solve the L^T */
94f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
95f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
96f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
97f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
98f1af5d2fSBarry Smith     idt  = 2*i;
99f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
100f1af5d2fSBarry Smith     while (nz--) {
101f1af5d2fSBarry Smith       idx   = 2*(*vi--);
102f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104f1af5d2fSBarry Smith       v -= 4;
105f1af5d2fSBarry Smith     }
106f1af5d2fSBarry Smith   }
107f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110f1af5d2fSBarry Smith   PetscFunctionReturn(0);
111f1af5d2fSBarry Smith }
112f1af5d2fSBarry Smith 
1134a2ae208SSatish Balay #undef __FUNCT__
1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
1157c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116f1af5d2fSBarry Smith {
117f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
120f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
12187828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
12287828ca2SBarry Smith   PetscScalar     *x,*b;
123f1af5d2fSBarry Smith 
124f1af5d2fSBarry Smith   PetscFunctionBegin;
125ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   /* forward solve the U^T */
130f1af5d2fSBarry Smith   idx = 0;
131f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
132f1af5d2fSBarry Smith 
133f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
134f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
135ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139f1af5d2fSBarry Smith     v += 9;
140f1af5d2fSBarry Smith 
141f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
142f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
143f1af5d2fSBarry Smith     while (nz--) {
144f1af5d2fSBarry Smith       oidx = 3*(*vi++);
145f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148f1af5d2fSBarry Smith       v  += 9;
149f1af5d2fSBarry Smith     }
150f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151f1af5d2fSBarry Smith     idx += 3;
152f1af5d2fSBarry Smith   }
153f1af5d2fSBarry Smith   /* backward solve the L^T */
154f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
155f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
156f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
157f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
158f1af5d2fSBarry Smith     idt  = 3*i;
159f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160f1af5d2fSBarry Smith     while (nz--) {
161f1af5d2fSBarry Smith       idx   = 3*(*vi--);
162f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165f1af5d2fSBarry Smith       v -= 9;
166f1af5d2fSBarry Smith     }
167f1af5d2fSBarry Smith   }
168f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171f1af5d2fSBarry Smith   PetscFunctionReturn(0);
172f1af5d2fSBarry Smith }
173f1af5d2fSBarry Smith 
1744a2ae208SSatish Balay #undef __FUNCT__
1754a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
1767c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177f1af5d2fSBarry Smith {
178f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
181f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
18287828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
18387828ca2SBarry Smith   PetscScalar     *x,*b;
184f1af5d2fSBarry Smith 
185f1af5d2fSBarry Smith   PetscFunctionBegin;
186ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189f1af5d2fSBarry Smith 
190f1af5d2fSBarry Smith   /* forward solve the U^T */
191f1af5d2fSBarry Smith   idx = 0;
192f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
195f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
196ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201f1af5d2fSBarry Smith     v += 16;
202f1af5d2fSBarry Smith 
203f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
204f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
205f1af5d2fSBarry Smith     while (nz--) {
206f1af5d2fSBarry Smith       oidx = 4*(*vi++);
207f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211f1af5d2fSBarry Smith       v  += 16;
212f1af5d2fSBarry Smith     }
213f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214f1af5d2fSBarry Smith     idx += 4;
215f1af5d2fSBarry Smith   }
216f1af5d2fSBarry Smith   /* backward solve the L^T */
217f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
218f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
219f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
220f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
221f1af5d2fSBarry Smith     idt  = 4*i;
222f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223f1af5d2fSBarry Smith     while (nz--) {
224f1af5d2fSBarry Smith       idx   = 4*(*vi--);
225f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229f1af5d2fSBarry Smith       v -= 16;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
232f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
2407c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241f1af5d2fSBarry Smith {
242f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
245f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
24687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
24787828ca2SBarry Smith   PetscScalar     *x,*b;
248f1af5d2fSBarry Smith 
249f1af5d2fSBarry Smith   PetscFunctionBegin;
250ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   /* forward solve the U^T */
255f1af5d2fSBarry Smith   idx = 0;
256f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
257f1af5d2fSBarry Smith 
258f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
259f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
260ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266f1af5d2fSBarry Smith     v += 25;
267f1af5d2fSBarry Smith 
268f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
269f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
270f1af5d2fSBarry Smith     while (nz--) {
271f1af5d2fSBarry Smith       oidx = 5*(*vi++);
272f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277f1af5d2fSBarry Smith       v  += 25;
278f1af5d2fSBarry Smith     }
279f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280f1af5d2fSBarry Smith     idx += 5;
281f1af5d2fSBarry Smith   }
282f1af5d2fSBarry Smith   /* backward solve the L^T */
283f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
284f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
285f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
286f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
287f1af5d2fSBarry Smith     idt  = 5*i;
288f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289f1af5d2fSBarry Smith     while (nz--) {
290f1af5d2fSBarry Smith       idx   = 5*(*vi--);
291f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296f1af5d2fSBarry Smith       v -= 25;
297f1af5d2fSBarry Smith     }
298f1af5d2fSBarry Smith   }
299f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302f1af5d2fSBarry Smith   PetscFunctionReturn(0);
303f1af5d2fSBarry Smith }
304f1af5d2fSBarry Smith 
3054a2ae208SSatish Balay #undef __FUNCT__
3064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
3077c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308f1af5d2fSBarry Smith {
309f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
312f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
31387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31487828ca2SBarry Smith   PetscScalar     *x,*b;
315f1af5d2fSBarry Smith 
316f1af5d2fSBarry Smith   PetscFunctionBegin;
317ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith   /* forward solve the U^T */
322f1af5d2fSBarry Smith   idx = 0;
323f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
324f1af5d2fSBarry Smith 
325f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
326f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
327ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328ef66eb69SBarry Smith     x6    = x[5+idx];
329f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335f1af5d2fSBarry Smith     v += 36;
336f1af5d2fSBarry Smith 
337f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
338f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
339f1af5d2fSBarry Smith     while (nz--) {
340f1af5d2fSBarry Smith       oidx = 6*(*vi++);
341f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347f1af5d2fSBarry Smith       v  += 36;
348f1af5d2fSBarry Smith     }
349f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350f1af5d2fSBarry Smith     x[5+idx] = s6;
351f1af5d2fSBarry Smith     idx += 6;
352f1af5d2fSBarry Smith   }
353f1af5d2fSBarry Smith   /* backward solve the L^T */
354f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
355f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
356f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
357f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
358f1af5d2fSBarry Smith     idt  = 6*i;
359f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360f1af5d2fSBarry Smith     s6 = x[5+idt];
361f1af5d2fSBarry Smith     while (nz--) {
362f1af5d2fSBarry Smith       idx   = 6*(*vi--);
363f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369f1af5d2fSBarry Smith       v -= 36;
370f1af5d2fSBarry Smith     }
371f1af5d2fSBarry Smith   }
372f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375f1af5d2fSBarry Smith   PetscFunctionReturn(0);
376f1af5d2fSBarry Smith }
377f1af5d2fSBarry Smith 
3784a2ae208SSatish Balay #undef __FUNCT__
3794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
3807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381f1af5d2fSBarry Smith {
382f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
385f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
38687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
38787828ca2SBarry Smith   PetscScalar     *x,*b;
388f1af5d2fSBarry Smith 
389f1af5d2fSBarry Smith   PetscFunctionBegin;
390ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393f1af5d2fSBarry Smith 
394f1af5d2fSBarry Smith   /* forward solve the U^T */
395f1af5d2fSBarry Smith   idx = 0;
396f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
399f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
400ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
402f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409f1af5d2fSBarry Smith     v += 49;
410f1af5d2fSBarry Smith 
411f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
412f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
413f1af5d2fSBarry Smith     while (nz--) {
414f1af5d2fSBarry Smith       oidx = 7*(*vi++);
415f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422f1af5d2fSBarry Smith       v  += 49;
423f1af5d2fSBarry Smith     }
424f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
426f1af5d2fSBarry Smith     idx += 7;
427f1af5d2fSBarry Smith   }
428f1af5d2fSBarry Smith   /* backward solve the L^T */
429f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
430f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
431f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
432f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
433f1af5d2fSBarry Smith     idt  = 7*i;
434f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
436f1af5d2fSBarry Smith     while (nz--) {
437f1af5d2fSBarry Smith       idx   = 7*(*vi--);
438f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445f1af5d2fSBarry Smith       v -= 49;
446f1af5d2fSBarry Smith     }
447f1af5d2fSBarry Smith   }
448f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451f1af5d2fSBarry Smith   PetscFunctionReturn(0);
452f1af5d2fSBarry Smith }
453f1af5d2fSBarry Smith 
454f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4554a2ae208SSatish Balay #undef __FUNCT__
4564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
4577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458f1af5d2fSBarry Smith {
459f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
461f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462f1af5d2fSBarry Smith   int             *diag = a->diag;
463f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
46487828ca2SBarry Smith   PetscScalar     s1,*x,*b,*t;
465f1af5d2fSBarry Smith 
466f1af5d2fSBarry Smith   PetscFunctionBegin;
467f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469f1af5d2fSBarry Smith   t  = a->solve_work;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473f1af5d2fSBarry Smith 
474f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
475f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
476f1af5d2fSBarry Smith     t[i] = b[c[i]];
477f1af5d2fSBarry Smith   }
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith   /* forward solve the U^T */
480f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith     v     = aa + diag[i];
483f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
484f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
485f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
486f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
487f1af5d2fSBarry Smith     while (nz--) {
488f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
489f1af5d2fSBarry Smith     }
490f1af5d2fSBarry Smith     t[i]   = s1;
491f1af5d2fSBarry Smith   }
492f1af5d2fSBarry Smith   /* backward solve the L^T */
493f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
494f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
495f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
496f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
497f1af5d2fSBarry Smith     s1   = t[i];
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith   }
502f1af5d2fSBarry Smith 
503f1af5d2fSBarry Smith   /* copy t into x according to permutation */
504f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
505f1af5d2fSBarry Smith     x[r[i]]   = t[i];
506f1af5d2fSBarry Smith   }
507f1af5d2fSBarry Smith 
508f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
513f1af5d2fSBarry Smith   PetscFunctionReturn(0);
514f1af5d2fSBarry Smith }
515f1af5d2fSBarry Smith 
5164a2ae208SSatish Balay #undef __FUNCT__
5174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
5187c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519f1af5d2fSBarry Smith {
520f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
522f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
524f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
52587828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
52687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
527f1af5d2fSBarry Smith 
528f1af5d2fSBarry Smith   PetscFunctionBegin;
529f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531f1af5d2fSBarry Smith   t  = a->solve_work;
532f1af5d2fSBarry Smith 
533f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535f1af5d2fSBarry Smith 
536f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
537f1af5d2fSBarry Smith   ii = 0;
538f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
539f1af5d2fSBarry Smith     ic      = 2*c[i];
540f1af5d2fSBarry Smith     t[ii]   = b[ic];
541f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
542f1af5d2fSBarry Smith     ii += 2;
543f1af5d2fSBarry Smith   }
544f1af5d2fSBarry Smith 
545f1af5d2fSBarry Smith   /* forward solve the U^T */
546f1af5d2fSBarry Smith   idx = 0;
547f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
550f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
551f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
552f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
553f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
554f1af5d2fSBarry Smith     v += 4;
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
557f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
558f1af5d2fSBarry Smith     while (nz--) {
559f1af5d2fSBarry Smith       oidx = 2*(*vi++);
560f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562f1af5d2fSBarry Smith       v  += 4;
563f1af5d2fSBarry Smith     }
564f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
565f1af5d2fSBarry Smith     idx += 2;
566f1af5d2fSBarry Smith   }
567f1af5d2fSBarry Smith   /* backward solve the L^T */
568f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
569f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
570f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
571f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
572f1af5d2fSBarry Smith     idt  = 2*i;
573f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
574f1af5d2fSBarry Smith     while (nz--) {
575f1af5d2fSBarry Smith       idx   = 2*(*vi--);
576f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578f1af5d2fSBarry Smith       v -= 4;
579f1af5d2fSBarry Smith     }
580f1af5d2fSBarry Smith   }
581f1af5d2fSBarry Smith 
582f1af5d2fSBarry Smith   /* copy t into x according to permutation */
583f1af5d2fSBarry Smith   ii = 0;
584f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
585f1af5d2fSBarry Smith     ir      = 2*r[i];
586f1af5d2fSBarry Smith     x[ir]   = t[ii];
587f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
588f1af5d2fSBarry Smith     ii += 2;
589f1af5d2fSBarry Smith   }
590f1af5d2fSBarry Smith 
591f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596f1af5d2fSBarry Smith   PetscFunctionReturn(0);
597f1af5d2fSBarry Smith }
598f1af5d2fSBarry Smith 
5994a2ae208SSatish Balay #undef __FUNCT__
6004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
6017c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602f1af5d2fSBarry Smith {
603f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
605f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
607f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
60887828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
60987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
610f1af5d2fSBarry Smith 
611f1af5d2fSBarry Smith   PetscFunctionBegin;
612f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614f1af5d2fSBarry Smith   t  = a->solve_work;
615f1af5d2fSBarry Smith 
616f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618f1af5d2fSBarry Smith 
619f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
620f1af5d2fSBarry Smith   ii = 0;
621f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
622f1af5d2fSBarry Smith     ic      = 3*c[i];
623f1af5d2fSBarry Smith     t[ii]   = b[ic];
624f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
625f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
626f1af5d2fSBarry Smith     ii += 3;
627f1af5d2fSBarry Smith   }
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   /* forward solve the U^T */
630f1af5d2fSBarry Smith   idx = 0;
631f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
632f1af5d2fSBarry Smith 
633f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
634f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
635f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639f1af5d2fSBarry Smith     v += 9;
640f1af5d2fSBarry Smith 
641f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
642f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
643f1af5d2fSBarry Smith     while (nz--) {
644f1af5d2fSBarry Smith       oidx = 3*(*vi++);
645f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648f1af5d2fSBarry Smith       v  += 9;
649f1af5d2fSBarry Smith     }
650f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651f1af5d2fSBarry Smith     idx += 3;
652f1af5d2fSBarry Smith   }
653f1af5d2fSBarry Smith   /* backward solve the L^T */
654f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
655f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
656f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
657f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
658f1af5d2fSBarry Smith     idt  = 3*i;
659f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660f1af5d2fSBarry Smith     while (nz--) {
661f1af5d2fSBarry Smith       idx   = 3*(*vi--);
662f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665f1af5d2fSBarry Smith       v -= 9;
666f1af5d2fSBarry Smith     }
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith 
669f1af5d2fSBarry Smith   /* copy t into x according to permutation */
670f1af5d2fSBarry Smith   ii = 0;
671f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
672f1af5d2fSBarry Smith     ir      = 3*r[i];
673f1af5d2fSBarry Smith     x[ir]   = t[ii];
674f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
675f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
676f1af5d2fSBarry Smith     ii += 3;
677f1af5d2fSBarry Smith   }
678f1af5d2fSBarry Smith 
679f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684f1af5d2fSBarry Smith   PetscFunctionReturn(0);
685f1af5d2fSBarry Smith }
686f1af5d2fSBarry Smith 
6874a2ae208SSatish Balay #undef __FUNCT__
6884a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
6897c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690f1af5d2fSBarry Smith {
691f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
693f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
695f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
69687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
69787828ca2SBarry Smith   PetscScalar     *x,*b,*t;
698f1af5d2fSBarry Smith 
699f1af5d2fSBarry Smith   PetscFunctionBegin;
700f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702f1af5d2fSBarry Smith   t  = a->solve_work;
703f1af5d2fSBarry Smith 
704f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706f1af5d2fSBarry Smith 
707f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
708f1af5d2fSBarry Smith   ii = 0;
709f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
710f1af5d2fSBarry Smith     ic      = 4*c[i];
711f1af5d2fSBarry Smith     t[ii]   = b[ic];
712f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
713f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
714f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
715f1af5d2fSBarry Smith     ii += 4;
716f1af5d2fSBarry Smith   }
717f1af5d2fSBarry Smith 
718f1af5d2fSBarry Smith   /* forward solve the U^T */
719f1af5d2fSBarry Smith   idx = 0;
720f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
723f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
724f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729f1af5d2fSBarry Smith     v += 16;
730f1af5d2fSBarry Smith 
731f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
732f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
733f1af5d2fSBarry Smith     while (nz--) {
734f1af5d2fSBarry Smith       oidx = 4*(*vi++);
735f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739f1af5d2fSBarry Smith       v  += 16;
740f1af5d2fSBarry Smith     }
741f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742f1af5d2fSBarry Smith     idx += 4;
743f1af5d2fSBarry Smith   }
744f1af5d2fSBarry Smith   /* backward solve the L^T */
745f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
746f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
747f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
748f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
749f1af5d2fSBarry Smith     idt  = 4*i;
750f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       idx   = 4*(*vi--);
753f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757f1af5d2fSBarry Smith       v -= 16;
758f1af5d2fSBarry Smith     }
759f1af5d2fSBarry Smith   }
760f1af5d2fSBarry Smith 
761f1af5d2fSBarry Smith   /* copy t into x according to permutation */
762f1af5d2fSBarry Smith   ii = 0;
763f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
764f1af5d2fSBarry Smith     ir      = 4*r[i];
765f1af5d2fSBarry Smith     x[ir]   = t[ii];
766f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
767f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
768f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
769f1af5d2fSBarry Smith     ii += 4;
770f1af5d2fSBarry Smith   }
771f1af5d2fSBarry Smith 
772f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777f1af5d2fSBarry Smith   PetscFunctionReturn(0);
778f1af5d2fSBarry Smith }
779f1af5d2fSBarry Smith 
7804a2ae208SSatish Balay #undef __FUNCT__
7814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
7827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783f1af5d2fSBarry Smith {
784f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
786f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
788f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
78987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
79087828ca2SBarry Smith   PetscScalar     *x,*b,*t;
791f1af5d2fSBarry Smith 
792f1af5d2fSBarry Smith   PetscFunctionBegin;
793f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795f1af5d2fSBarry Smith   t  = a->solve_work;
796f1af5d2fSBarry Smith 
797f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799f1af5d2fSBarry Smith 
800f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
801f1af5d2fSBarry Smith   ii = 0;
802f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
803f1af5d2fSBarry Smith     ic      = 5*c[i];
804f1af5d2fSBarry Smith     t[ii]   = b[ic];
805f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
806f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
807f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
808f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
809f1af5d2fSBarry Smith     ii += 5;
810f1af5d2fSBarry Smith   }
811f1af5d2fSBarry Smith 
812f1af5d2fSBarry Smith   /* forward solve the U^T */
813f1af5d2fSBarry Smith   idx = 0;
814f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
817f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
818f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824f1af5d2fSBarry Smith     v += 25;
825f1af5d2fSBarry Smith 
826f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
827f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
828f1af5d2fSBarry Smith     while (nz--) {
829f1af5d2fSBarry Smith       oidx = 5*(*vi++);
830f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835f1af5d2fSBarry Smith       v  += 25;
836f1af5d2fSBarry Smith     }
837f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838f1af5d2fSBarry Smith     idx += 5;
839f1af5d2fSBarry Smith   }
840f1af5d2fSBarry Smith   /* backward solve the L^T */
841f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
842f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
843f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
844f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
845f1af5d2fSBarry Smith     idt  = 5*i;
846f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       idx   = 5*(*vi--);
849f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v -= 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith 
858f1af5d2fSBarry Smith   /* copy t into x according to permutation */
859f1af5d2fSBarry Smith   ii = 0;
860f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
861f1af5d2fSBarry Smith     ir      = 5*r[i];
862f1af5d2fSBarry Smith     x[ir]   = t[ii];
863f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
864f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
865f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
866f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
867f1af5d2fSBarry Smith     ii += 5;
868f1af5d2fSBarry Smith   }
869f1af5d2fSBarry Smith 
870f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875f1af5d2fSBarry Smith   PetscFunctionReturn(0);
876f1af5d2fSBarry Smith }
877f1af5d2fSBarry Smith 
8784a2ae208SSatish Balay #undef __FUNCT__
8794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
8807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881f1af5d2fSBarry Smith {
882f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
884f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
886f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
88787828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
88887828ca2SBarry Smith   PetscScalar     *x,*b,*t;
889f1af5d2fSBarry Smith 
890f1af5d2fSBarry Smith   PetscFunctionBegin;
891f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893f1af5d2fSBarry Smith   t  = a->solve_work;
894f1af5d2fSBarry Smith 
895f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897f1af5d2fSBarry Smith 
898f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
899f1af5d2fSBarry Smith   ii = 0;
900f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
901f1af5d2fSBarry Smith     ic      = 6*c[i];
902f1af5d2fSBarry Smith     t[ii]   = b[ic];
903f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
904f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
905f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
906f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
907f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
908f1af5d2fSBarry Smith     ii += 6;
909f1af5d2fSBarry Smith   }
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   /* forward solve the U^T */
912f1af5d2fSBarry Smith   idx = 0;
913f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
914f1af5d2fSBarry Smith 
915f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
916f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
917f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918f1af5d2fSBarry Smith     x6    = t[5+idx];
919f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925f1af5d2fSBarry Smith     v += 36;
926f1af5d2fSBarry Smith 
927f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
928f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
929f1af5d2fSBarry Smith     while (nz--) {
930f1af5d2fSBarry Smith       oidx = 6*(*vi++);
931f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937f1af5d2fSBarry Smith       v  += 36;
938f1af5d2fSBarry Smith     }
939f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940f1af5d2fSBarry Smith     t[5+idx] = s6;
941f1af5d2fSBarry Smith     idx += 6;
942f1af5d2fSBarry Smith   }
943f1af5d2fSBarry Smith   /* backward solve the L^T */
944f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
945f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
946f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
947f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
948f1af5d2fSBarry Smith     idt  = 6*i;
949f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950f1af5d2fSBarry Smith     s6 = t[5+idt];
951f1af5d2fSBarry Smith     while (nz--) {
952f1af5d2fSBarry Smith       idx   = 6*(*vi--);
953f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959f1af5d2fSBarry Smith       v -= 36;
960f1af5d2fSBarry Smith     }
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith 
963f1af5d2fSBarry Smith   /* copy t into x according to permutation */
964f1af5d2fSBarry Smith   ii = 0;
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith     ir      = 6*r[i];
967f1af5d2fSBarry Smith     x[ir]   = t[ii];
968f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
969f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
970f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
971f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
972f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
973f1af5d2fSBarry Smith     ii += 6;
974f1af5d2fSBarry Smith   }
975f1af5d2fSBarry Smith 
976f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981f1af5d2fSBarry Smith   PetscFunctionReturn(0);
982f1af5d2fSBarry Smith }
983f1af5d2fSBarry Smith 
9844a2ae208SSatish Balay #undef __FUNCT__
9854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
9867c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987f1af5d2fSBarry Smith {
988f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
990f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
992f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
99387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
99487828ca2SBarry Smith   PetscScalar     *x,*b,*t;
995f1af5d2fSBarry Smith 
996f1af5d2fSBarry Smith   PetscFunctionBegin;
997f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999f1af5d2fSBarry Smith   t  = a->solve_work;
1000f1af5d2fSBarry Smith 
1001f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003f1af5d2fSBarry Smith 
1004f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1005f1af5d2fSBarry Smith   ii = 0;
1006f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1007f1af5d2fSBarry Smith     ic      = 7*c[i];
1008f1af5d2fSBarry Smith     t[ii]   = b[ic];
1009f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1010f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1011f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1012f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1013f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1014f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1015f1af5d2fSBarry Smith     ii += 7;
1016f1af5d2fSBarry Smith   }
1017f1af5d2fSBarry Smith 
1018f1af5d2fSBarry Smith   /* forward solve the U^T */
1019f1af5d2fSBarry Smith   idx = 0;
1020f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1023f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1024f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1026f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033f1af5d2fSBarry Smith     v += 49;
1034f1af5d2fSBarry Smith 
1035f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1036f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1037f1af5d2fSBarry Smith     while (nz--) {
1038f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1039f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046f1af5d2fSBarry Smith       v  += 49;
1047f1af5d2fSBarry Smith     }
1048f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1050f1af5d2fSBarry Smith     idx += 7;
1051f1af5d2fSBarry Smith   }
1052f1af5d2fSBarry Smith   /* backward solve the L^T */
1053f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1054f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1055f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1056f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1057f1af5d2fSBarry Smith     idt  = 7*i;
1058f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1062f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v -= 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith   }
1072f1af5d2fSBarry Smith 
1073f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1074f1af5d2fSBarry Smith   ii = 0;
1075f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1076f1af5d2fSBarry Smith     ir      = 7*r[i];
1077f1af5d2fSBarry Smith     x[ir]   = t[ii];
1078f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1079f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1080f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1081f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1082f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1083f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1084f1af5d2fSBarry Smith     ii += 7;
1085f1af5d2fSBarry Smith   }
1086f1af5d2fSBarry Smith 
1087f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1093f1af5d2fSBarry Smith }
1094f1af5d2fSBarry Smith 
10954e2b4712SSatish Balay /* ----------------------------------------------------------- */
10964a2ae208SSatish Balay #undef __FUNCT__
10974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
10984e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
10994e2b4712SSatish Balay {
11004e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11014e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11024e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
11034e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
11043f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
110587828ca2SBarry Smith   PetscScalar     *x,*b,*s,*t,*ls;
11064e2b4712SSatish Balay 
11074e2b4712SSatish Balay   PetscFunctionBegin;
1108e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110f1af5d2fSBarry Smith   t  = a->solve_work;
11114e2b4712SSatish Balay 
11124e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11134e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11144e2b4712SSatish Balay 
11154e2b4712SSatish Balay   /* forward solve the lower triangular */
111687828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11174e2b4712SSatish Balay   for (i=1; i<n; i++) {
11184e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11194e2b4712SSatish Balay     vi  = aj + ai[i];
11204e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1121f1af5d2fSBarry Smith     s = t + bs*i;
112287828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11234e2b4712SSatish Balay     while (nz--) {
1124f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11254e2b4712SSatish Balay       v += bs2;
11264e2b4712SSatish Balay     }
11274e2b4712SSatish Balay   }
11284e2b4712SSatish Balay   /* backward solve the upper triangular */
1129273d9f13SBarry Smith   ls = a->solve_work + A->n;
11304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11314e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11324e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11334e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
113487828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11354e2b4712SSatish Balay     while (nz--) {
1136f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11374e2b4712SSatish Balay       v += bs2;
11384e2b4712SSatish Balay     }
1139f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
114087828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11414e2b4712SSatish Balay   }
11424e2b4712SSatish Balay 
11434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11484e2b4712SSatish Balay   PetscFunctionReturn(0);
11494e2b4712SSatish Balay }
11504e2b4712SSatish Balay 
11514a2ae208SSatish Balay #undef __FUNCT__
11524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
11534e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11544e2b4712SSatish Balay {
11554e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11564e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11574e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11584e2b4712SSatish Balay   int             *diag = a->diag;
11593f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
116087828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
116187828ca2SBarry Smith   PetscScalar     *x,*b,*t;
11624e2b4712SSatish Balay 
11634e2b4712SSatish Balay   PetscFunctionBegin;
1164e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166f1af5d2fSBarry Smith   t  = a->solve_work;
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11704e2b4712SSatish Balay 
11714e2b4712SSatish Balay   /* forward solve the lower triangular */
11724e2b4712SSatish Balay   idx    = 7*(*r++);
1173f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1174f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11764e2b4712SSatish Balay 
11774e2b4712SSatish Balay   for (i=1; i<n; i++) {
11784e2b4712SSatish Balay     v     = aa + 49*ai[i];
11794e2b4712SSatish Balay     vi    = aj + ai[i];
11804e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11814e2b4712SSatish Balay     idx   = 7*(*r++);
1182f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11844e2b4712SSatish Balay     while (nz--) {
11854e2b4712SSatish Balay       idx   = 7*(*vi++);
1186f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1188f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1189f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11964e2b4712SSatish Balay       v += 49;
11974e2b4712SSatish Balay     }
11984e2b4712SSatish Balay     idx = 7*i;
1199f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1200f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12024e2b4712SSatish Balay   }
12034e2b4712SSatish Balay   /* backward solve the upper triangular */
12044e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12054e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12064e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12074e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12084e2b4712SSatish Balay     idt  = 7*i;
1209f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1210f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12124e2b4712SSatish Balay     while (nz--) {
12134e2b4712SSatish Balay       idx   = 7*(*vi++);
1214f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1215f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1217f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12244e2b4712SSatish Balay       v += 49;
12254e2b4712SSatish Balay     }
12264e2b4712SSatish Balay     idc = 7*(*c--);
12274e2b4712SSatish Balay     v   = aa + 49*diag[i];
1228f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12424e2b4712SSatish Balay   }
12434e2b4712SSatish Balay 
12444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12494e2b4712SSatish Balay   PetscFunctionReturn(0);
12504e2b4712SSatish Balay }
12514e2b4712SSatish Balay 
12524a2ae208SSatish Balay #undef __FUNCT__
12534a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
125415091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
125515091d37SBarry Smith {
125615091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
125715091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
125815091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
125915091d37SBarry Smith   MatScalar       *aa=a->a,*v;
126087828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
126115091d37SBarry Smith 
126215091d37SBarry Smith   PetscFunctionBegin;
126315091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
126415091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126515091d37SBarry Smith   /* forward solve the lower triangular */
126615091d37SBarry Smith   idx    = 0;
126715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
126815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
126915091d37SBarry Smith   x[6] = b[6+idx];
127015091d37SBarry Smith   for (i=1; i<n; i++) {
127115091d37SBarry Smith     v     =  aa + 49*ai[i];
127215091d37SBarry Smith     vi    =  aj + ai[i];
127315091d37SBarry Smith     nz    =  diag[i] - ai[i];
127415091d37SBarry Smith     idx   =  7*i;
1275f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277f1af5d2fSBarry Smith     s7  =  b[6+idx];
127815091d37SBarry Smith     while (nz--) {
127915091d37SBarry Smith       jdx   = 7*(*vi++);
128015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
128115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
128215091d37SBarry Smith       x7    = x[6+jdx];
1283f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
129015091d37SBarry Smith       v += 49;
129115091d37SBarry Smith      }
1292f1af5d2fSBarry Smith     x[idx]   = s1;
1293f1af5d2fSBarry Smith     x[1+idx] = s2;
1294f1af5d2fSBarry Smith     x[2+idx] = s3;
1295f1af5d2fSBarry Smith     x[3+idx] = s4;
1296f1af5d2fSBarry Smith     x[4+idx] = s5;
1297f1af5d2fSBarry Smith     x[5+idx] = s6;
1298f1af5d2fSBarry Smith     x[6+idx] = s7;
129915091d37SBarry Smith   }
130015091d37SBarry Smith   /* backward solve the upper triangular */
130115091d37SBarry Smith   for (i=n-1; i>=0; i--){
130215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
130315091d37SBarry Smith     vi   = aj + diag[i] + 1;
130415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
130515091d37SBarry Smith     idt  = 7*i;
1306f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1307f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1308f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1309f1af5d2fSBarry Smith     s7 = x[6+idt];
131015091d37SBarry Smith     while (nz--) {
131115091d37SBarry Smith       idx   = 7*(*vi++);
131215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
131315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
131415091d37SBarry Smith       x7    = x[6+idx];
1315f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
132215091d37SBarry Smith       v += 49;
132315091d37SBarry Smith     }
132415091d37SBarry Smith     v        = aa + 49*diag[i];
1325f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
133915091d37SBarry Smith   }
134015091d37SBarry Smith 
134115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
134215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
134415091d37SBarry Smith   PetscFunctionReturn(0);
134515091d37SBarry Smith }
134615091d37SBarry Smith 
13474a2ae208SSatish Balay #undef __FUNCT__
13484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
134915091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
135015091d37SBarry Smith {
135115091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
135215091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
135315091d37SBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
135415091d37SBarry Smith   int             *diag = a->diag;
135515091d37SBarry Smith   MatScalar       *aa=a->a,*v;
135687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
135715091d37SBarry Smith 
135815091d37SBarry Smith   PetscFunctionBegin;
135915091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
136015091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361f1af5d2fSBarry Smith   t  = a->solve_work;
136215091d37SBarry Smith 
136315091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
136415091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
136515091d37SBarry Smith 
136615091d37SBarry Smith   /* forward solve the lower triangular */
136715091d37SBarry Smith   idx    = 6*(*r++);
1368f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1369f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1370f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
137115091d37SBarry Smith   for (i=1; i<n; i++) {
137215091d37SBarry Smith     v     = aa + 36*ai[i];
137315091d37SBarry Smith     vi    = aj + ai[i];
137415091d37SBarry Smith     nz    = diag[i] - ai[i];
137515091d37SBarry Smith     idx   = 6*(*r++);
1376f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
137815091d37SBarry Smith     while (nz--) {
137915091d37SBarry Smith       idx   = 6*(*vi++);
1380f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
138815091d37SBarry Smith       v += 36;
138915091d37SBarry Smith     }
139015091d37SBarry Smith     idx = 6*i;
1391f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1392f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1393f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
139415091d37SBarry Smith   }
139515091d37SBarry Smith   /* backward solve the upper triangular */
139615091d37SBarry Smith   for (i=n-1; i>=0; i--){
139715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
139815091d37SBarry Smith     vi   = aj + diag[i] + 1;
139915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
140015091d37SBarry Smith     idt  = 6*i;
1401f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1402f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1403f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
140415091d37SBarry Smith     while (nz--) {
140515091d37SBarry Smith       idx   = 6*(*vi++);
1406f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1407f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1408f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1409f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141515091d37SBarry Smith       v += 36;
141615091d37SBarry Smith     }
141715091d37SBarry Smith     idc = 6*(*c--);
141815091d37SBarry Smith     v   = aa + 36*diag[i];
1419f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
143115091d37SBarry Smith   }
143215091d37SBarry Smith 
143315091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
143415091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
143515091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
143615091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
143815091d37SBarry Smith   PetscFunctionReturn(0);
143915091d37SBarry Smith }
144015091d37SBarry Smith 
14414a2ae208SSatish Balay #undef __FUNCT__
14424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
144315091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
144415091d37SBarry Smith {
144515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
144615091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
144715091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
144815091d37SBarry Smith   MatScalar       *aa=a->a,*v;
144987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
145015091d37SBarry Smith 
145115091d37SBarry Smith   PetscFunctionBegin;
145215091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
145315091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
145415091d37SBarry Smith   /* forward solve the lower triangular */
145515091d37SBarry Smith   idx    = 0;
145615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
145715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
145815091d37SBarry Smith   for (i=1; i<n; i++) {
145915091d37SBarry Smith     v     =  aa + 36*ai[i];
146015091d37SBarry Smith     vi    =  aj + ai[i];
146115091d37SBarry Smith     nz    =  diag[i] - ai[i];
146215091d37SBarry Smith     idx   =  6*i;
1463f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
146515091d37SBarry Smith     while (nz--) {
146615091d37SBarry Smith       jdx   = 6*(*vi++);
146715091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
146815091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
147515091d37SBarry Smith       v += 36;
147615091d37SBarry Smith      }
1477f1af5d2fSBarry Smith     x[idx]   = s1;
1478f1af5d2fSBarry Smith     x[1+idx] = s2;
1479f1af5d2fSBarry Smith     x[2+idx] = s3;
1480f1af5d2fSBarry Smith     x[3+idx] = s4;
1481f1af5d2fSBarry Smith     x[4+idx] = s5;
1482f1af5d2fSBarry Smith     x[5+idx] = s6;
148315091d37SBarry Smith   }
148415091d37SBarry Smith   /* backward solve the upper triangular */
148515091d37SBarry Smith   for (i=n-1; i>=0; i--){
148615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148715091d37SBarry Smith     vi   = aj + diag[i] + 1;
148815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148915091d37SBarry Smith     idt  = 6*i;
1490f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1491f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1492f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
149315091d37SBarry Smith     while (nz--) {
149415091d37SBarry Smith       idx   = 6*(*vi++);
149515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
149615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150315091d37SBarry Smith       v += 36;
150415091d37SBarry Smith     }
150515091d37SBarry Smith     v        = aa + 36*diag[i];
1506f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
151215091d37SBarry Smith   }
151315091d37SBarry Smith 
151415091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
151515091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
151715091d37SBarry Smith   PetscFunctionReturn(0);
151815091d37SBarry Smith }
151915091d37SBarry Smith 
15204a2ae208SSatish Balay #undef __FUNCT__
15214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
15224e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15234e2b4712SSatish Balay {
15244e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15254e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
15264e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15274e2b4712SSatish Balay   int             *diag = a->diag;
15283f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
152987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15304e2b4712SSatish Balay 
15314e2b4712SSatish Balay   PetscFunctionBegin;
1532e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534f1af5d2fSBarry Smith   t  = a->solve_work;
15354e2b4712SSatish Balay 
15364e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15374e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15384e2b4712SSatish Balay 
15394e2b4712SSatish Balay   /* forward solve the lower triangular */
15404e2b4712SSatish Balay   idx    = 5*(*r++);
1541f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1542f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15434e2b4712SSatish Balay   for (i=1; i<n; i++) {
15444e2b4712SSatish Balay     v     = aa + 25*ai[i];
15454e2b4712SSatish Balay     vi    = aj + ai[i];
15464e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15474e2b4712SSatish Balay     idx   = 5*(*r++);
1548f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549f1af5d2fSBarry Smith     s5  = b[4+idx];
15504e2b4712SSatish Balay     while (nz--) {
15514e2b4712SSatish Balay       idx   = 5*(*vi++);
1552f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1554f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15594e2b4712SSatish Balay       v += 25;
15604e2b4712SSatish Balay     }
15614e2b4712SSatish Balay     idx = 5*i;
1562f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1563f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15644e2b4712SSatish Balay   }
15654e2b4712SSatish Balay   /* backward solve the upper triangular */
15664e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15674e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15684e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15694e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15704e2b4712SSatish Balay     idt  = 5*i;
1571f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1572f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15734e2b4712SSatish Balay     while (nz--) {
15744e2b4712SSatish Balay       idx   = 5*(*vi++);
1575f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1576f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15824e2b4712SSatish Balay       v += 25;
15834e2b4712SSatish Balay     }
15844e2b4712SSatish Balay     idc = 5*(*c--);
15854e2b4712SSatish Balay     v   = aa + 25*diag[i];
1586f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1588f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1590f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1592f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1594f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
15964e2b4712SSatish Balay   }
15974e2b4712SSatish Balay 
15984e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15994e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
16034e2b4712SSatish Balay   PetscFunctionReturn(0);
16044e2b4712SSatish Balay }
16054e2b4712SSatish Balay 
16064a2ae208SSatish Balay #undef __FUNCT__
16074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
160815091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
160915091d37SBarry Smith {
161015091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
161115091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
161215091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
161315091d37SBarry Smith   MatScalar       *aa=a->a,*v;
161487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
161515091d37SBarry Smith 
161615091d37SBarry Smith   PetscFunctionBegin;
161715091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
161815091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
161915091d37SBarry Smith   /* forward solve the lower triangular */
162015091d37SBarry Smith   idx    = 0;
162115091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
162215091d37SBarry Smith   for (i=1; i<n; i++) {
162315091d37SBarry Smith     v     =  aa + 25*ai[i];
162415091d37SBarry Smith     vi    =  aj + ai[i];
162515091d37SBarry Smith     nz    =  diag[i] - ai[i];
162615091d37SBarry Smith     idx   =  5*i;
1627f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
162815091d37SBarry Smith     while (nz--) {
162915091d37SBarry Smith       jdx   = 5*(*vi++);
163015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
163615091d37SBarry Smith       v    += 25;
163715091d37SBarry Smith     }
1638f1af5d2fSBarry Smith     x[idx]   = s1;
1639f1af5d2fSBarry Smith     x[1+idx] = s2;
1640f1af5d2fSBarry Smith     x[2+idx] = s3;
1641f1af5d2fSBarry Smith     x[3+idx] = s4;
1642f1af5d2fSBarry Smith     x[4+idx] = s5;
164315091d37SBarry Smith   }
164415091d37SBarry Smith   /* backward solve the upper triangular */
164515091d37SBarry Smith   for (i=n-1; i>=0; i--){
164615091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
164715091d37SBarry Smith     vi   = aj + diag[i] + 1;
164815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
164915091d37SBarry Smith     idt  = 5*i;
1650f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1651f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
165215091d37SBarry Smith     while (nz--) {
165315091d37SBarry Smith       idx   = 5*(*vi++);
165415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166015091d37SBarry Smith       v    += 25;
166115091d37SBarry Smith     }
166215091d37SBarry Smith     v        = aa + 25*diag[i];
1663f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
166815091d37SBarry Smith   }
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167115091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
167315091d37SBarry Smith   PetscFunctionReturn(0);
167415091d37SBarry Smith }
167515091d37SBarry Smith 
16764a2ae208SSatish Balay #undef __FUNCT__
16774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
16784e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16794e2b4712SSatish Balay {
16804e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
16814e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
16824e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
16834e2b4712SSatish Balay   int             *diag = a->diag;
16843f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
168587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
16864e2b4712SSatish Balay 
16874e2b4712SSatish Balay   PetscFunctionBegin;
1688e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690f1af5d2fSBarry Smith   t  = a->solve_work;
16914e2b4712SSatish Balay 
16924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16944e2b4712SSatish Balay 
16954e2b4712SSatish Balay   /* forward solve the lower triangular */
16964e2b4712SSatish Balay   idx    = 4*(*r++);
1697f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1698f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
16994e2b4712SSatish Balay   for (i=1; i<n; i++) {
17004e2b4712SSatish Balay     v     = aa + 16*ai[i];
17014e2b4712SSatish Balay     vi    = aj + ai[i];
17024e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17034e2b4712SSatish Balay     idx   = 4*(*r++);
1704f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17054e2b4712SSatish Balay     while (nz--) {
17064e2b4712SSatish Balay       idx   = 4*(*vi++);
1707f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17124e2b4712SSatish Balay       v    += 16;
17134e2b4712SSatish Balay     }
17144e2b4712SSatish Balay     idx        = 4*i;
1715f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1716f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17174e2b4712SSatish Balay   }
17184e2b4712SSatish Balay   /* backward solve the upper triangular */
17194e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17204e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17214e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17224e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17234e2b4712SSatish Balay     idt  = 4*i;
1724f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1725f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17264e2b4712SSatish Balay     while (nz--) {
17274e2b4712SSatish Balay       idx   = 4*(*vi++);
1728f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1729f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1730f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17344e2b4712SSatish Balay       v += 16;
17354e2b4712SSatish Balay     }
17364e2b4712SSatish Balay     idc      = 4*(*c--);
17374e2b4712SSatish Balay     v        = aa + 16*diag[i];
1738f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17424e2b4712SSatish Balay   }
17434e2b4712SSatish Balay 
17444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17494e2b4712SSatish Balay   PetscFunctionReturn(0);
17504e2b4712SSatish Balay }
1751f26ec98cSKris Buschelman 
1752f26ec98cSKris Buschelman #undef __FUNCT__
1753f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1754f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755f26ec98cSKris Buschelman {
1756f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757f26ec98cSKris Buschelman   IS              iscol=a->col,isrow=a->row;
1758f26ec98cSKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759f26ec98cSKris Buschelman   int             *diag = a->diag;
1760f26ec98cSKris Buschelman   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761f26ec98cSKris Buschelman   PetscScalar     *x,*b;
1762f26ec98cSKris Buschelman 
1763f26ec98cSKris Buschelman   PetscFunctionBegin;
1764f26ec98cSKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1765f26ec98cSKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1766f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1767f26ec98cSKris Buschelman 
1768f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1769f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1770f26ec98cSKris Buschelman 
1771f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1772f26ec98cSKris Buschelman   idx    = 4*(*r++);
1773f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1774f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1775f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1776f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1777f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1778f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1779f26ec98cSKris Buschelman     vi    = aj + ai[i];
1780f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1781f26ec98cSKris Buschelman     idx   = 4*(*r++);
1782f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1783f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1784f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1785f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1786f26ec98cSKris Buschelman     while (nz--) {
1787f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1788f26ec98cSKris Buschelman       x1  = t[idx];
1789f26ec98cSKris Buschelman       x2  = t[1+idx];
1790f26ec98cSKris Buschelman       x3  = t[2+idx];
1791f26ec98cSKris Buschelman       x4  = t[3+idx];
1792f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796f26ec98cSKris Buschelman       v    += 16;
1797f26ec98cSKris Buschelman     }
1798f26ec98cSKris Buschelman     idx        = 4*i;
1799f26ec98cSKris Buschelman     t[idx]   = s1;
1800f26ec98cSKris Buschelman     t[1+idx] = s2;
1801f26ec98cSKris Buschelman     t[2+idx] = s3;
1802f26ec98cSKris Buschelman     t[3+idx] = s4;
1803f26ec98cSKris Buschelman   }
1804f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1805f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1806f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1807f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1808f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1809f26ec98cSKris Buschelman     idt  = 4*i;
1810f26ec98cSKris Buschelman     s1 = t[idt];
1811f26ec98cSKris Buschelman     s2 = t[1+idt];
1812f26ec98cSKris Buschelman     s3 = t[2+idt];
1813f26ec98cSKris Buschelman     s4 = t[3+idt];
1814f26ec98cSKris Buschelman     while (nz--) {
1815f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1816f26ec98cSKris Buschelman       x1  = t[idx];
1817f26ec98cSKris Buschelman       x2  = t[1+idx];
1818f26ec98cSKris Buschelman       x3  = t[2+idx];
1819f26ec98cSKris Buschelman       x4  = t[3+idx];
1820f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824f26ec98cSKris Buschelman       v += 16;
1825f26ec98cSKris Buschelman     }
1826f26ec98cSKris Buschelman     idc      = 4*(*c--);
1827f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1828f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1833f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1834f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1835f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1836f26ec98cSKris Buschelman  }
1837f26ec98cSKris Buschelman 
1838f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1839f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1840f26ec98cSKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1841f26ec98cSKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1842f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1844f26ec98cSKris Buschelman }
1845f26ec98cSKris Buschelman 
184624c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
184724c233c2SKris Buschelman 
184824c233c2SKris Buschelman #include PETSC_HAVE_SSE
184924c233c2SKris Buschelman 
185024c233c2SKris Buschelman #undef __FUNCT__
185124c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
185224c233c2SKris Buschelman int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
185324c233c2SKris Buschelman {
185424c233c2SKris Buschelman   /*
185524c233c2SKris Buschelman      Note: This code uses demotion of double
185624c233c2SKris Buschelman      to float when performing the mixed-mode computation.
185724c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
185824c233c2SKris Buschelman   */
185924c233c2SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
186024c233c2SKris Buschelman   IS              iscol=a->col,isrow=a->row;
186124c233c2SKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
186224c233c2SKris Buschelman   int             *diag = a->diag,ai16;
186324c233c2SKris Buschelman   MatScalar       *aa=a->a,*v;
186487828ca2SBarry Smith   PetscScalar     *x,*b,*t;
186524c233c2SKris Buschelman 
186624c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
186724c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
186824c233c2SKris Buschelman   unsigned long   offset;
186924c233c2SKris Buschelman 
187024c233c2SKris Buschelman   PetscFunctionBegin;
187124c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
187224c233c2SKris Buschelman 
187324c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
187424c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
187524c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
187624c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
187724c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
187824c233c2SKris Buschelman 
187924c233c2SKris Buschelman     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188024c233c2SKris Buschelman     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
188124c233c2SKris Buschelman     t  = a->solve_work;
188224c233c2SKris Buschelman 
188324c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
188424c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
188524c233c2SKris Buschelman 
188624c233c2SKris Buschelman     /* forward solve the lower triangular */
188724c233c2SKris Buschelman     idx  = 4*(*r++);
188824c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
188924c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
189024c233c2SKris Buschelman     v    =  aa + 16*ai[1];
189124c233c2SKris Buschelman 
189224c233c2SKris Buschelman     for (i=1; i<n;) {
189324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
189424c233c2SKris Buschelman       vi   =  aj      + ai[i];
189524c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
189624c233c2SKris Buschelman       idx  =  4*(*r++);
189724c233c2SKris Buschelman 
189824c233c2SKris Buschelman       /* Demote sum from double to float */
189924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
190024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
190124c233c2SKris Buschelman 
190224c233c2SKris Buschelman       while (nz--) {
190324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
190424c233c2SKris Buschelman         idx = 4*(*vi++);
190524c233c2SKris Buschelman 
190624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
190724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
190824c233c2SKris Buschelman 
190924c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
191024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
191124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
191224c233c2SKris Buschelman 
191324c233c2SKris Buschelman           /* First Column */
191424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
191524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
191624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
191724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
191824c233c2SKris Buschelman 
191924c233c2SKris Buschelman           /* Second Column */
192024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
192124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
192224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
192324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
192424c233c2SKris Buschelman 
192524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
192624c233c2SKris Buschelman 
192724c233c2SKris Buschelman           /* Third Column */
192824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
192924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
193024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
193124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
193224c233c2SKris Buschelman 
193324c233c2SKris Buschelman           /* Fourth Column */
193424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
193524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
193624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
193724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
193824c233c2SKris Buschelman         SSE_INLINE_END_2
193924c233c2SKris Buschelman 
194024c233c2SKris Buschelman         v  += 16;
194124c233c2SKris Buschelman       }
194224c233c2SKris Buschelman       idx = 4*i;
194324c233c2SKris Buschelman       v   = aa + 16*ai[++i];
194424c233c2SKris Buschelman       PREFETCH_NTA(v);
194524c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
194624c233c2SKris Buschelman 
194724c233c2SKris Buschelman       /* Promote result from float to double */
194824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
194924c233c2SKris Buschelman     }
195024c233c2SKris Buschelman     /* backward solve the upper triangular */
195124c233c2SKris Buschelman     idt  = 4*(n-1);
195224c233c2SKris Buschelman     ai16 = 16*diag[n-1];
195324c233c2SKris Buschelman     v    = aa + ai16 + 16;
195424c233c2SKris Buschelman     for (i=n-1; i>=0;){
195524c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
195624c233c2SKris Buschelman       vi = aj + diag[i] + 1;
195724c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
195824c233c2SKris Buschelman 
195924c233c2SKris Buschelman       /* Demote accumulator from double to float */
196024c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
196124c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
196224c233c2SKris Buschelman 
196324c233c2SKris Buschelman       while (nz--) {
196424c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
196524c233c2SKris Buschelman         idx = 4*(*vi++);
196624c233c2SKris Buschelman 
196724c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
196824c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
196924c233c2SKris Buschelman 
197024c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
197124c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
197224c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
197324c233c2SKris Buschelman 
197424c233c2SKris Buschelman           /* First Column */
197524c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
197624c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
197724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
197824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
197924c233c2SKris Buschelman 
198024c233c2SKris Buschelman           /* Second Column */
198124c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
198224c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
198324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
198424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
198524c233c2SKris Buschelman 
198624c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
198724c233c2SKris Buschelman 
198824c233c2SKris Buschelman           /* Third Column */
198924c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
199024c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
199124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
199224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
199324c233c2SKris Buschelman 
199424c233c2SKris Buschelman           /* Fourth Column */
199524c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
199624c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
199724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
199824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
199924c233c2SKris Buschelman         SSE_INLINE_END_2
200024c233c2SKris Buschelman         v  += 16;
200124c233c2SKris Buschelman       }
200224c233c2SKris Buschelman       v    = aa + ai16;
200324c233c2SKris Buschelman       ai16 = 16*diag[--i];
200424c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
200524c233c2SKris Buschelman       /*
200624c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
200724c233c2SKris Buschelman          which was inverted as part of the factorization
200824c233c2SKris Buschelman       */
200924c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
201024c233c2SKris Buschelman         /* First Column */
201124c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
201224c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
201324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
201424c233c2SKris Buschelman 
201524c233c2SKris Buschelman         /* Second Column */
201624c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
201724c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
201824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
201924c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
202024c233c2SKris Buschelman 
202124c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
202224c233c2SKris Buschelman 
202324c233c2SKris Buschelman         /* Third Column */
202424c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
202524c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
202624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
202724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
202824c233c2SKris Buschelman 
202924c233c2SKris Buschelman         /* Fourth Column */
203024c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
203124c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
203224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
203324c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
203424c233c2SKris Buschelman 
203524c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
203624c233c2SKris Buschelman       SSE_INLINE_END_3
203724c233c2SKris Buschelman 
203824c233c2SKris Buschelman       /* Promote solution from float to double */
203924c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
204024c233c2SKris Buschelman 
204124c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
204224c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
204324c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
204424c233c2SKris Buschelman       idc  = 4*(*c--);
204524c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
204624c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
204724c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
204824c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
204924c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
205024c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
205124c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
205224c233c2SKris Buschelman       SSE_INLINE_END_2
205324c233c2SKris Buschelman       v    = aa + ai16 + 16;
205424c233c2SKris Buschelman       idt -= 4;
205524c233c2SKris Buschelman     }
205624c233c2SKris Buschelman 
205724c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
205824c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
205924c233c2SKris Buschelman     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
206024c233c2SKris Buschelman     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
206124c233c2SKris Buschelman     PetscLogFlops(2*16*(a->nz) - 4*A->n);
206224c233c2SKris Buschelman   SSE_SCOPE_END;
206324c233c2SKris Buschelman   PetscFunctionReturn(0);
206424c233c2SKris Buschelman }
206524c233c2SKris Buschelman 
206624c233c2SKris Buschelman #endif
20670ef38995SBarry Smith 
20680ef38995SBarry Smith 
20694e2b4712SSatish Balay /*
20704e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
20714e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
20724e2b4712SSatish Balay */
20734a2ae208SSatish Balay #undef __FUNCT__
20744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
20754e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
20764e2b4712SSatish Balay {
20774e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
207830d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
207930d4dcafSBarry Smith   int             ierr,*diag = a->diag;
20803f1db9ecSBarry Smith   MatScalar       *aa=a->a;
208187828ca2SBarry Smith   PetscScalar     *x,*b;
20824e2b4712SSatish Balay 
20834e2b4712SSatish Balay   PetscFunctionBegin;
2084e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2085e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20864e2b4712SSatish Balay 
2087aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
20882853dc0eSBarry Smith   {
208987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20902853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
20912853dc0eSBarry Smith   }
2092aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
20932853dc0eSBarry Smith   {
209487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20952853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
20962853dc0eSBarry Smith   }
2097aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
20982853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099e1293385SBarry Smith #else
210030d4dcafSBarry Smith   {
210187828ca2SBarry Smith     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
21023f1db9ecSBarry Smith     MatScalar    *v;
21034e555682SBarry Smith     int          jdx,idt,idx,nz,*vi,i,ai16;
2104e1293385SBarry Smith 
21054e2b4712SSatish Balay   /* forward solve the lower triangular */
21064e2b4712SSatish Balay   idx    = 0;
2107e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21084e2b4712SSatish Balay   for (i=1; i<n; i++) {
21094e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21104e2b4712SSatish Balay     vi    =  aj      + ai[i];
21114e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2112e1293385SBarry Smith     idx   +=  4;
2113f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21144e2b4712SSatish Balay     while (nz--) {
21154e2b4712SSatish Balay       jdx   = 4*(*vi++);
21164e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21214e2b4712SSatish Balay       v    += 16;
21224e2b4712SSatish Balay     }
2123f1af5d2fSBarry Smith     x[idx]   = s1;
2124f1af5d2fSBarry Smith     x[1+idx] = s2;
2125f1af5d2fSBarry Smith     x[2+idx] = s3;
2126f1af5d2fSBarry Smith     x[3+idx] = s4;
21274e2b4712SSatish Balay   }
21284e2b4712SSatish Balay   /* backward solve the upper triangular */
21294e555682SBarry Smith   idt = 4*(n-1);
21304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21314e555682SBarry Smith     ai16 = 16*diag[i];
21324e555682SBarry Smith     v    = aa + ai16 + 16;
21334e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21344e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2135f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2136f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21374e2b4712SSatish Balay     while (nz--) {
21384e2b4712SSatish Balay       idx   = 4*(*vi++);
21394e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21444e2b4712SSatish Balay       v    += 16;
21454e2b4712SSatish Balay     }
21464e555682SBarry Smith     v        = aa + ai16;
2147f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151329f5518SBarry Smith     idt -= 4;
21524e2b4712SSatish Balay   }
215330d4dcafSBarry Smith   }
2154e1293385SBarry Smith #endif
21554e2b4712SSatish Balay 
2156e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2157e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2158b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
21594e2b4712SSatish Balay   PetscFunctionReturn(0);
21604e2b4712SSatish Balay }
21614e2b4712SSatish Balay 
2162f26ec98cSKris Buschelman #undef __FUNCT__
2163f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2164f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165f26ec98cSKris Buschelman {
2166f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167f26ec98cSKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168f26ec98cSKris Buschelman   int             ierr,*diag = a->diag;
2169f26ec98cSKris Buschelman   MatScalar       *aa=a->a;
2170f26ec98cSKris Buschelman   PetscScalar     *x,*b;
2171f26ec98cSKris Buschelman 
2172f26ec98cSKris Buschelman   PetscFunctionBegin;
2173f26ec98cSKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2174f26ec98cSKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2175f26ec98cSKris Buschelman 
2176f26ec98cSKris Buschelman   {
2177f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2179f26ec98cSKris Buschelman     int        jdx,idt,idx,nz,*vi,i,ai16;
2180f26ec98cSKris Buschelman 
2181f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2182f26ec98cSKris Buschelman     idx  = 0;
2183f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2184f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2185f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2186f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2187f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2188f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2189f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2190f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2191f26ec98cSKris Buschelman       idx   +=  4;
2192f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2193f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2194f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2195f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2196f26ec98cSKris Buschelman       while (nz--) {
2197f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2198f26ec98cSKris Buschelman         x1  = t[jdx];
2199f26ec98cSKris Buschelman         x2  = t[1+jdx];
2200f26ec98cSKris Buschelman         x3  = t[2+jdx];
2201f26ec98cSKris Buschelman         x4  = t[3+jdx];
2202f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206f26ec98cSKris Buschelman         v    += 16;
2207f26ec98cSKris Buschelman       }
2208f26ec98cSKris Buschelman       t[idx]   = s1;
2209f26ec98cSKris Buschelman       t[1+idx] = s2;
2210f26ec98cSKris Buschelman       t[2+idx] = s3;
2211f26ec98cSKris Buschelman       t[3+idx] = s4;
2212f26ec98cSKris Buschelman     }
2213f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2214f26ec98cSKris Buschelman     idt = 4*(n-1);
2215f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2216f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2217f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2218f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2219f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2220f26ec98cSKris Buschelman       s1   = t[idt];
2221f26ec98cSKris Buschelman       s2   = t[1+idt];
2222f26ec98cSKris Buschelman       s3   = t[2+idt];
2223f26ec98cSKris Buschelman       s4   = t[3+idt];
2224f26ec98cSKris Buschelman       while (nz--) {
2225f26ec98cSKris Buschelman         idx = 4*(*vi++);
2226f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2227f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2228f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2229f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2230f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234f26ec98cSKris Buschelman         v    += 16;
2235f26ec98cSKris Buschelman       }
2236f26ec98cSKris Buschelman       v        = aa + ai16;
2237f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241f26ec98cSKris Buschelman       idt -= 4;
2242f26ec98cSKris Buschelman     }
2243f26ec98cSKris Buschelman   }
2244f26ec98cSKris Buschelman 
2245f26ec98cSKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2246f26ec98cSKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2247f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2249f26ec98cSKris Buschelman }
2250f26ec98cSKris Buschelman 
22513660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22523660e330SKris Buschelman 
22533660e330SKris Buschelman #include PETSC_HAVE_SSE
22543660e330SKris Buschelman 
22553660e330SKris Buschelman #undef __FUNCT__
22563660e330SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
22573660e330SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
22583660e330SKris Buschelman {
22593660e330SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
22603660e330SKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
22613660e330SKris Buschelman   int             ierr,*diag = a->diag;
22623660e330SKris Buschelman   MatScalar       *aa=a->a;
226387828ca2SBarry Smith   PetscScalar     *x,*b;
22643660e330SKris Buschelman 
22653660e330SKris Buschelman   PetscFunctionBegin;
22663660e330SKris Buschelman   SSE_SCOPE_BEGIN;
22673660e330SKris Buschelman   /*
22683660e330SKris Buschelman      Note: This code currently uses demotion of double
22693660e330SKris Buschelman      to float when performing the mixed-mode computation.
22703660e330SKris Buschelman      This may not be numerically reasonable for all applications.
22713660e330SKris Buschelman   */
22723660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
22733660e330SKris Buschelman 
22743660e330SKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22753660e330SKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22763660e330SKris Buschelman   {
22773660e330SKris Buschelman     MatScalar     *v;
22783660e330SKris Buschelman     int           jdx,idt,idx,nz,*vi,i,ai16;
22793660e330SKris Buschelman 
22803660e330SKris Buschelman     /* Make space in temp stack for 16 Byte Aligned arrays */
22813660e330SKris Buschelman     float         ssealignedspace[11],*tmps,*tmpx;
22823660e330SKris Buschelman     unsigned long offset = (unsigned long)ssealignedspace % 16;
22833660e330SKris Buschelman     if (offset) offset = (16 - offset)/4;
22843660e330SKris Buschelman     tmps = &ssealignedspace[offset];
22853660e330SKris Buschelman     tmpx = &ssealignedspace[offset+4];
22863660e330SKris Buschelman 
22873660e330SKris Buschelman   /* forward solve the lower triangular */
22883660e330SKris Buschelman     idx  = 0;
22893660e330SKris Buschelman     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
22903660e330SKris Buschelman     v    =  aa + 16*ai[1];
22913660e330SKris Buschelman 
22923660e330SKris Buschelman     for (i=1; i<n;) {
22933660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
22943660e330SKris Buschelman       vi   =  aj      + ai[i];
22953660e330SKris Buschelman       nz   =  diag[i] - ai[i];
22963660e330SKris Buschelman       idx +=  4;
22973660e330SKris Buschelman 
22983660e330SKris Buschelman     /* Demote sum from double to float */
22993660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
23003660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
23013660e330SKris Buschelman 
23023660e330SKris Buschelman       while (nz--) {
23033660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23043660e330SKris Buschelman         jdx = 4*(*vi++);
23053660e330SKris Buschelman 
23063660e330SKris Buschelman         /* Demote solution (so far) from double to float */
23073660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
23083660e330SKris Buschelman 
23093660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
23103660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
23113660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23123660e330SKris Buschelman 
23133660e330SKris Buschelman           /* First Column */
23143660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23153660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23163660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23173660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23183660e330SKris Buschelman 
23193660e330SKris Buschelman           /* Second Column */
23203660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23213660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23223660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23233660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23243660e330SKris Buschelman 
23253660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23263660e330SKris Buschelman 
23273660e330SKris Buschelman           /* Third Column */
23283660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23293660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23303660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23313660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23323660e330SKris Buschelman 
23333660e330SKris Buschelman           /* Fourth Column */
23343660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23353660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23363660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23373660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23383660e330SKris Buschelman         SSE_INLINE_END_2
23393660e330SKris Buschelman 
23403660e330SKris Buschelman         v  += 16;
23413660e330SKris Buschelman       }
23423660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23433660e330SKris Buschelman       PREFETCH_NTA(v);
23443660e330SKris Buschelman       STORE_PS(tmps,XMM7);
23453660e330SKris Buschelman 
23463660e330SKris Buschelman       /* Promote result from float to double */
23473660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
23483660e330SKris Buschelman     }
23493660e330SKris Buschelman     /* backward solve the upper triangular */
23503660e330SKris Buschelman     idt  = 4*(n-1);
23513660e330SKris Buschelman     ai16 = 16*diag[n-1];
23523660e330SKris Buschelman     v    = aa + ai16 + 16;
23533660e330SKris Buschelman     for (i=n-1; i>=0;){
23543660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23553660e330SKris Buschelman       vi = aj + diag[i] + 1;
23563660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23573660e330SKris Buschelman 
23583660e330SKris Buschelman       /* Demote accumulator from double to float */
23593660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
23603660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
23613660e330SKris Buschelman 
23623660e330SKris Buschelman       while (nz--) {
23633660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23643660e330SKris Buschelman         idx = 4*(*vi++);
23653660e330SKris Buschelman 
23663660e330SKris Buschelman         /* Demote solution (so far) from double to float */
23673660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
23683660e330SKris Buschelman 
23693660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
23703660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
23713660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23723660e330SKris Buschelman 
23733660e330SKris Buschelman           /* First Column */
23743660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23753660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23763660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23773660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23783660e330SKris Buschelman 
23793660e330SKris Buschelman           /* Second Column */
23803660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23813660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23823660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23833660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23843660e330SKris Buschelman 
23853660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23863660e330SKris Buschelman 
23873660e330SKris Buschelman           /* Third Column */
23883660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23893660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23903660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23913660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23923660e330SKris Buschelman 
23933660e330SKris Buschelman           /* Fourth Column */
23943660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23953660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23963660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23973660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23983660e330SKris Buschelman         SSE_INLINE_END_2
23993660e330SKris Buschelman         v  += 16;
24003660e330SKris Buschelman       }
24013660e330SKris Buschelman       v    = aa + ai16;
24023660e330SKris Buschelman       ai16 = 16*diag[--i];
24033660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
24043660e330SKris Buschelman       /*
24053660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
24063660e330SKris Buschelman          which was inverted as part of the factorization
24073660e330SKris Buschelman       */
24083660e330SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
24093660e330SKris Buschelman         /* First Column */
24103660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
24113660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
24123660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
24133660e330SKris Buschelman 
24143660e330SKris Buschelman         /* Second Column */
24153660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24163660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24173660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24183660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24193660e330SKris Buschelman 
24203660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24213660e330SKris Buschelman 
24223660e330SKris Buschelman         /* Third Column */
24233660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24243660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24253660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24263660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24273660e330SKris Buschelman 
24283660e330SKris Buschelman         /* Fourth Column */
24293660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24303660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24313660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24323660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24333660e330SKris Buschelman 
24343660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24353660e330SKris Buschelman       SSE_INLINE_END_3
24363660e330SKris Buschelman 
24373660e330SKris Buschelman       /* Promote solution from float to double */
24383660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
24393660e330SKris Buschelman 
24403660e330SKris Buschelman       v    = aa + ai16 + 16;
24413660e330SKris Buschelman       idt -= 4;
24423660e330SKris Buschelman     }
24433660e330SKris Buschelman   }
24443660e330SKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
24453660e330SKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24463660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
24473660e330SKris Buschelman   SSE_SCOPE_END;
24483660e330SKris Buschelman   PetscFunctionReturn(0);
24493660e330SKris Buschelman }
24503660e330SKris Buschelman 
24513660e330SKris Buschelman #endif
24524a2ae208SSatish Balay #undef __FUNCT__
24534a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
24544e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
24554e2b4712SSatish Balay {
24564e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
24574e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
24584e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
24594e2b4712SSatish Balay   int             *diag = a->diag;
24603f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
246187828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
24624e2b4712SSatish Balay 
24634e2b4712SSatish Balay   PetscFunctionBegin;
2464e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2465e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2466f1af5d2fSBarry Smith   t  = a->solve_work;
24674e2b4712SSatish Balay 
24684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
24694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
24704e2b4712SSatish Balay 
24714e2b4712SSatish Balay   /* forward solve the lower triangular */
24724e2b4712SSatish Balay   idx    = 3*(*r++);
2473f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
24744e2b4712SSatish Balay   for (i=1; i<n; i++) {
24754e2b4712SSatish Balay     v     = aa + 9*ai[i];
24764e2b4712SSatish Balay     vi    = aj + ai[i];
24774e2b4712SSatish Balay     nz    = diag[i] - ai[i];
24784e2b4712SSatish Balay     idx   = 3*(*r++);
2479f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
24804e2b4712SSatish Balay     while (nz--) {
24814e2b4712SSatish Balay       idx   = 3*(*vi++);
2482f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2483f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2484f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2485f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
24864e2b4712SSatish Balay       v += 9;
24874e2b4712SSatish Balay     }
24884e2b4712SSatish Balay     idx = 3*i;
2489f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
24904e2b4712SSatish Balay   }
24914e2b4712SSatish Balay   /* backward solve the upper triangular */
24924e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24934e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
24944e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24954e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
24964e2b4712SSatish Balay     idt  = 3*i;
2497f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
24984e2b4712SSatish Balay     while (nz--) {
24994e2b4712SSatish Balay       idx   = 3*(*vi++);
2500f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2501f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2502f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2503f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
25044e2b4712SSatish Balay       v += 9;
25054e2b4712SSatish Balay     }
25064e2b4712SSatish Balay     idc = 3*(*c--);
25074e2b4712SSatish Balay     v   = aa + 9*diag[i];
2508f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2509f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2510f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
25114e2b4712SSatish Balay   }
25124e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25134e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2514e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2515e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2516b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
25174e2b4712SSatish Balay   PetscFunctionReturn(0);
25184e2b4712SSatish Balay }
25194e2b4712SSatish Balay 
252015091d37SBarry Smith /*
252115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
252215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
252315091d37SBarry Smith */
25244a2ae208SSatish Balay #undef __FUNCT__
25254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
252615091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
252715091d37SBarry Smith {
252815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
252915091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
253015091d37SBarry Smith   int             ierr,*diag = a->diag;
253115091d37SBarry Smith   MatScalar       *aa=a->a,*v;
253287828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
253315091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
253415091d37SBarry Smith 
253515091d37SBarry Smith   PetscFunctionBegin;
253615091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
253715091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253815091d37SBarry Smith 
253915091d37SBarry Smith 
254015091d37SBarry Smith   /* forward solve the lower triangular */
254115091d37SBarry Smith   idx    = 0;
254215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
254315091d37SBarry Smith   for (i=1; i<n; i++) {
254415091d37SBarry Smith     v     =  aa      + 9*ai[i];
254515091d37SBarry Smith     vi    =  aj      + ai[i];
254615091d37SBarry Smith     nz    =  diag[i] - ai[i];
254715091d37SBarry Smith     idx   +=  3;
2548f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
254915091d37SBarry Smith     while (nz--) {
255015091d37SBarry Smith       jdx   = 3*(*vi++);
255115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2552f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2553f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2554f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
255515091d37SBarry Smith       v    += 9;
255615091d37SBarry Smith     }
2557f1af5d2fSBarry Smith     x[idx]   = s1;
2558f1af5d2fSBarry Smith     x[1+idx] = s2;
2559f1af5d2fSBarry Smith     x[2+idx] = s3;
256015091d37SBarry Smith   }
256115091d37SBarry Smith   /* backward solve the upper triangular */
256215091d37SBarry Smith   for (i=n-1; i>=0; i--){
256315091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
256415091d37SBarry Smith     vi   = aj + diag[i] + 1;
256515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
256615091d37SBarry Smith     idt  = 3*i;
2567f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2568f1af5d2fSBarry Smith     s3 = x[2+idt];
256915091d37SBarry Smith     while (nz--) {
257015091d37SBarry Smith       idx   = 3*(*vi++);
257115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2572f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2573f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2574f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
257515091d37SBarry Smith       v    += 9;
257615091d37SBarry Smith     }
257715091d37SBarry Smith     v        = aa +  9*diag[i];
2578f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2579f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2580f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
258115091d37SBarry Smith   }
258215091d37SBarry Smith 
258315091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
258415091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2585b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
258615091d37SBarry Smith   PetscFunctionReturn(0);
258715091d37SBarry Smith }
258815091d37SBarry Smith 
25894a2ae208SSatish Balay #undef __FUNCT__
25904a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
25914e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
25924e2b4712SSatish Balay {
25934e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
25944e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
25954e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
25964e2b4712SSatish Balay   int             *diag = a->diag;
25973f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
259887828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
25994e2b4712SSatish Balay 
26004e2b4712SSatish Balay   PetscFunctionBegin;
2601e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2602e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2603f1af5d2fSBarry Smith   t  = a->solve_work;
26044e2b4712SSatish Balay 
26054e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26064e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26074e2b4712SSatish Balay 
26084e2b4712SSatish Balay   /* forward solve the lower triangular */
26094e2b4712SSatish Balay   idx    = 2*(*r++);
2610f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
26114e2b4712SSatish Balay   for (i=1; i<n; i++) {
26124e2b4712SSatish Balay     v     = aa + 4*ai[i];
26134e2b4712SSatish Balay     vi    = aj + ai[i];
26144e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26154e2b4712SSatish Balay     idx   = 2*(*r++);
2616f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
26174e2b4712SSatish Balay     while (nz--) {
26184e2b4712SSatish Balay       idx   = 2*(*vi++);
2619f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2620f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2621f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
26224e2b4712SSatish Balay       v += 4;
26234e2b4712SSatish Balay     }
26244e2b4712SSatish Balay     idx = 2*i;
2625f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
26264e2b4712SSatish Balay   }
26274e2b4712SSatish Balay   /* backward solve the upper triangular */
26284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26294e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
26304e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26314e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26324e2b4712SSatish Balay     idt  = 2*i;
2633f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
26344e2b4712SSatish Balay     while (nz--) {
26354e2b4712SSatish Balay       idx   = 2*(*vi++);
2636f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2637f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2638f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
26394e2b4712SSatish Balay       v += 4;
26404e2b4712SSatish Balay     }
26414e2b4712SSatish Balay     idc = 2*(*c--);
26424e2b4712SSatish Balay     v   = aa + 4*diag[i];
2643f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2644f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
26454e2b4712SSatish Balay   }
26464e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26474e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2648e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2649e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2650b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
26514e2b4712SSatish Balay   PetscFunctionReturn(0);
26524e2b4712SSatish Balay }
26534e2b4712SSatish Balay 
265415091d37SBarry Smith /*
265515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
265615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
265715091d37SBarry Smith */
26584a2ae208SSatish Balay #undef __FUNCT__
26594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
266015091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
266115091d37SBarry Smith {
266215091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
266315091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
266415091d37SBarry Smith   int             ierr,*diag = a->diag;
266515091d37SBarry Smith   MatScalar       *aa=a->a,*v;
266687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2;
266715091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
266815091d37SBarry Smith 
266915091d37SBarry Smith   PetscFunctionBegin;
267015091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
267115091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
267215091d37SBarry Smith 
267315091d37SBarry Smith   /* forward solve the lower triangular */
267415091d37SBarry Smith   idx    = 0;
267515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
267615091d37SBarry Smith   for (i=1; i<n; i++) {
267715091d37SBarry Smith     v     =  aa      + 4*ai[i];
267815091d37SBarry Smith     vi    =  aj      + ai[i];
267915091d37SBarry Smith     nz    =  diag[i] - ai[i];
268015091d37SBarry Smith     idx   +=  2;
2681f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
268215091d37SBarry Smith     while (nz--) {
268315091d37SBarry Smith       jdx   = 2*(*vi++);
268415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2685f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2686f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
268715091d37SBarry Smith       v    += 4;
268815091d37SBarry Smith     }
2689f1af5d2fSBarry Smith     x[idx]   = s1;
2690f1af5d2fSBarry Smith     x[1+idx] = s2;
269115091d37SBarry Smith   }
269215091d37SBarry Smith   /* backward solve the upper triangular */
269315091d37SBarry Smith   for (i=n-1; i>=0; i--){
269415091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
269515091d37SBarry Smith     vi   = aj + diag[i] + 1;
269615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
269715091d37SBarry Smith     idt  = 2*i;
2698f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
269915091d37SBarry Smith     while (nz--) {
270015091d37SBarry Smith       idx   = 2*(*vi++);
270115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2702f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2703f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
270415091d37SBarry Smith       v    += 4;
270515091d37SBarry Smith     }
270615091d37SBarry Smith     v        = aa +  4*diag[i];
2707f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2708f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
270915091d37SBarry Smith   }
271015091d37SBarry Smith 
271115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
271215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2713b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
271415091d37SBarry Smith   PetscFunctionReturn(0);
271515091d37SBarry Smith }
271615091d37SBarry Smith 
27174a2ae208SSatish Balay #undef __FUNCT__
27184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
27194e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
27204e2b4712SSatish Balay {
27214e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
27224e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
27234e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
27244e2b4712SSatish Balay   int             *diag = a->diag;
27253f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
272687828ca2SBarry Smith   PetscScalar     *x,*b,s1,*t;
27274e2b4712SSatish Balay 
27284e2b4712SSatish Balay   PetscFunctionBegin;
27294e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
27304e2b4712SSatish Balay 
2731e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2732e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2733f1af5d2fSBarry Smith   t  = a->solve_work;
27344e2b4712SSatish Balay 
27354e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27364e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27374e2b4712SSatish Balay 
27384e2b4712SSatish Balay   /* forward solve the lower triangular */
2739f1af5d2fSBarry Smith   t[0] = b[*r++];
27404e2b4712SSatish Balay   for (i=1; i<n; i++) {
27414e2b4712SSatish Balay     v     = aa + ai[i];
27424e2b4712SSatish Balay     vi    = aj + ai[i];
27434e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2744f1af5d2fSBarry Smith     s1  = b[*r++];
27454e2b4712SSatish Balay     while (nz--) {
2746f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
27474e2b4712SSatish Balay     }
2748f1af5d2fSBarry Smith     t[i] = s1;
27494e2b4712SSatish Balay   }
27504e2b4712SSatish Balay   /* backward solve the upper triangular */
27514e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27524e2b4712SSatish Balay     v    = aa + diag[i] + 1;
27534e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27544e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2755f1af5d2fSBarry Smith     s1 = t[i];
27564e2b4712SSatish Balay     while (nz--) {
2757f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
27584e2b4712SSatish Balay     }
2759f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
27604e2b4712SSatish Balay   }
27614e2b4712SSatish Balay 
27624e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27634e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2764e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2765e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2766b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
27674e2b4712SSatish Balay   PetscFunctionReturn(0);
27684e2b4712SSatish Balay }
276915091d37SBarry Smith /*
277015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
277115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
277215091d37SBarry Smith */
27734a2ae208SSatish Balay #undef __FUNCT__
27744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
277515091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
277615091d37SBarry Smith {
277715091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
277815091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
277915091d37SBarry Smith   int             ierr,*diag = a->diag;
278015091d37SBarry Smith   MatScalar       *aa=a->a;
278187828ca2SBarry Smith   PetscScalar     *x,*b;
278287828ca2SBarry Smith   PetscScalar     s1,x1;
278315091d37SBarry Smith   MatScalar       *v;
278415091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
278515091d37SBarry Smith 
278615091d37SBarry Smith   PetscFunctionBegin;
278715091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
278815091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
278915091d37SBarry Smith 
279015091d37SBarry Smith   /* forward solve the lower triangular */
279115091d37SBarry Smith   idx    = 0;
279215091d37SBarry Smith   x[0]   = b[0];
279315091d37SBarry Smith   for (i=1; i<n; i++) {
279415091d37SBarry Smith     v     =  aa      + ai[i];
279515091d37SBarry Smith     vi    =  aj      + ai[i];
279615091d37SBarry Smith     nz    =  diag[i] - ai[i];
279715091d37SBarry Smith     idx   +=  1;
2798f1af5d2fSBarry Smith     s1  =  b[idx];
279915091d37SBarry Smith     while (nz--) {
280015091d37SBarry Smith       jdx   = *vi++;
280115091d37SBarry Smith       x1    = x[jdx];
2802f1af5d2fSBarry Smith       s1 -= v[0]*x1;
280315091d37SBarry Smith       v    += 1;
280415091d37SBarry Smith     }
2805f1af5d2fSBarry Smith     x[idx]   = s1;
280615091d37SBarry Smith   }
280715091d37SBarry Smith   /* backward solve the upper triangular */
280815091d37SBarry Smith   for (i=n-1; i>=0; i--){
280915091d37SBarry Smith     v    = aa + diag[i] + 1;
281015091d37SBarry Smith     vi   = aj + diag[i] + 1;
281115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
281215091d37SBarry Smith     idt  = i;
2813f1af5d2fSBarry Smith     s1 = x[idt];
281415091d37SBarry Smith     while (nz--) {
281515091d37SBarry Smith       idx   = *vi++;
281615091d37SBarry Smith       x1    = x[idx];
2817f1af5d2fSBarry Smith       s1 -= v[0]*x1;
281815091d37SBarry Smith       v    += 1;
281915091d37SBarry Smith     }
282015091d37SBarry Smith     v        = aa +  diag[i];
2821f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
282215091d37SBarry Smith   }
282315091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
282415091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2825b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
282615091d37SBarry Smith   PetscFunctionReturn(0);
282715091d37SBarry Smith }
28284e2b4712SSatish Balay 
28294e2b4712SSatish Balay /* ----------------------------------------------------------------*/
28304e2b4712SSatish Balay /*
28314e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
28324e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
28334e2b4712SSatish Balay    Not a good example of code reuse.
28344e2b4712SSatish Balay */
2835ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2836435faa5fSBarry Smith 
28374a2ae208SSatish Balay #undef __FUNCT__
28384a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2839435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
28404e2b4712SSatish Balay {
28414e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
28424e2b4712SSatish Balay   IS          isicol;
28434e2b4712SSatish Balay   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
28444e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2845335d9088SBarry Smith   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2846435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
28474533b203SBarry Smith   PetscTruth  col_identity,row_identity;
2848329f5518SBarry Smith   PetscReal   f;
28494e2b4712SSatish Balay 
28504e2b4712SSatish Balay   PetscFunctionBegin;
2851435faa5fSBarry Smith   f             = info->fill;
2852335d9088SBarry Smith   levels        = (int)info->levels;
2853335d9088SBarry Smith   diagonal_fill = (int)info->diagonal_fill;
28544c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2855667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2856667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2857309c388cSBarry Smith 
2858309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2859bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2860bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
2861bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
2862bb3d539aSBarry Smith     if (!b->diag) {
2863bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2864bb3d539aSBarry Smith     }
2865bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2866bb3d539aSBarry Smith     b->row        = isrow;
2867bb3d539aSBarry Smith     b->col        = iscol;
2868bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2869bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2870bb3d539aSBarry Smith     b->icol       = isicol;
2871bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
287287828ca2SBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2873309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
28744e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
28754e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
28764e2b4712SSatish Balay 
28774e2b4712SSatish Balay     /* get new row pointers */
2878b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
28794e2b4712SSatish Balay     ainew[0] = 0;
28804e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
28814e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
288282502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
28834e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
288482502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
28854e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
2886b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
28874e2b4712SSatish Balay     /* im is level for each filled value */
2888b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
28894e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
2890b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
28914e2b4712SSatish Balay     dloc[0]  = 0;
28924e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
2893435faa5fSBarry Smith 
2894435faa5fSBarry Smith       /* copy prow into linked list */
28954e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
289629bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
28974e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
28984e2b4712SSatish Balay       fill[n]    = n;
2899435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
29004e2b4712SSatish Balay       while (nz--) {
29014e2b4712SSatish Balay 	fm  = n;
29024e2b4712SSatish Balay 	idx = ic[*xi++];
29034e2b4712SSatish Balay 	do {
29044e2b4712SSatish Balay 	  m  = fm;
29054e2b4712SSatish Balay 	  fm = fill[m];
29064e2b4712SSatish Balay 	} while (fm < idx);
29074e2b4712SSatish Balay 	fill[m]   = idx;
29084e2b4712SSatish Balay 	fill[idx] = fm;
29094e2b4712SSatish Balay 	im[idx]   = 0;
29104e2b4712SSatish Balay       }
2911435faa5fSBarry Smith 
2912435faa5fSBarry Smith       /* make sure diagonal entry is included */
2913435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
2914435faa5fSBarry Smith 	fm = n;
2915435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
2916435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2917435faa5fSBarry Smith 	fill[fm]   = prow;
2918435faa5fSBarry Smith 	im[prow]   = 0;
2919435faa5fSBarry Smith 	nzf++;
2920335d9088SBarry Smith 	dcount++;
2921435faa5fSBarry Smith       }
2922435faa5fSBarry Smith 
29234e2b4712SSatish Balay       nzi = 0;
29244e2b4712SSatish Balay       row = fill[n];
29254e2b4712SSatish Balay       while (row < prow) {
29264e2b4712SSatish Balay 	incrlev = im[row] + 1;
29274e2b4712SSatish Balay 	nz      = dloc[row];
2928435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
29294e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
29304e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
29314e2b4712SSatish Balay 	fm      = row;
29324e2b4712SSatish Balay 	while (nnz-- > 0) {
29334e2b4712SSatish Balay 	  idx = *xi++;
29344e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
29354e2b4712SSatish Balay 	    flev++;
29364e2b4712SSatish Balay 	    continue;
29374e2b4712SSatish Balay 	  }
29384e2b4712SSatish Balay 	  do {
29394e2b4712SSatish Balay 	    m  = fm;
29404e2b4712SSatish Balay 	    fm = fill[m];
29414e2b4712SSatish Balay 	  } while (fm < idx);
29424e2b4712SSatish Balay 	  if (fm != idx) {
29434e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
29444e2b4712SSatish Balay 	    fill[m]   = idx;
29454e2b4712SSatish Balay 	    fill[idx] = fm;
29464e2b4712SSatish Balay 	    fm        = idx;
29474e2b4712SSatish Balay 	    nzf++;
2948ecf371e4SBarry Smith 	  } else {
29494e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
29504e2b4712SSatish Balay 	  }
29514e2b4712SSatish Balay 	  flev++;
29524e2b4712SSatish Balay 	}
29534e2b4712SSatish Balay 	row = fill[row];
29544e2b4712SSatish Balay 	nzi++;
29554e2b4712SSatish Balay       }
29564e2b4712SSatish Balay       /* copy new filled row into permanent storage */
29574e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
29584e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
2959ecf371e4SBarry Smith 
2960ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
2961ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2962ecf371e4SBarry Smith 	/* just double the memory each time */
2963ecf371e4SBarry Smith 	int maxadd = jmax;
2964ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
29654e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
29664e2b4712SSatish Balay 	jmax += maxadd;
2967ecf371e4SBarry Smith 
2968ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
296982502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2970549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2971606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
29724e2b4712SSatish Balay 	ajnew = xi;
297382502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2974549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2975606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
29764e2b4712SSatish Balay 	ajfill = xi;
2977ecf371e4SBarry Smith 	realloc++; /* count how many reallocations are needed */
29784e2b4712SSatish Balay       }
29794e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
29804e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
29814e2b4712SSatish Balay       dloc[prow]  = nzi;
29824e2b4712SSatish Balay       fm          = fill[n];
29834e2b4712SSatish Balay       while (nzf--) {
29844e2b4712SSatish Balay 	*xi++   = fm;
29854e2b4712SSatish Balay 	*flev++ = im[fm];
29864e2b4712SSatish Balay 	fm      = fill[fm];
29874e2b4712SSatish Balay       }
2988435faa5fSBarry Smith       /* make sure row has diagonal entry */
2989435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
299029bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2991435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2992435faa5fSBarry Smith       }
29934e2b4712SSatish Balay     }
2994606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
29954e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
29964e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2997606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
2998606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
29994e2b4712SSatish Balay 
30004e2b4712SSatish Balay     {
3001329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3002b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
3003b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3004b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3005b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3006335d9088SBarry Smith       if (diagonal_fill) {
3007b1bcba4aSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3008335d9088SBarry Smith       }
30094e2b4712SSatish Balay     }
30104e2b4712SSatish Balay 
30114e2b4712SSatish Balay     /* put together the new matrix */
30124e2b4712SSatish Balay     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
3013b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
30144e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
3015606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
30167c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
30173f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
30184e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
3019606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
3020606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
302182502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
30224e2b4712SSatish Balay     b->j          = ajnew;
30234e2b4712SSatish Balay     b->i          = ainew;
30244e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
30254e2b4712SSatish Balay     b->diag       = dloc;
30264e2b4712SSatish Balay     b->ilen       = 0;
30274e2b4712SSatish Balay     b->imax       = 0;
30284e2b4712SSatish Balay     b->row        = isrow;
30294e2b4712SSatish Balay     b->col        = iscol;
3030bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3031c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3032c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3033e51c0b9cSSatish Balay     b->icol       = isicol;
303487828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
30354e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
30364e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
303787828ca2SBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
30384e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
30394e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
30404e2b4712SSatish Balay 
30414e2b4712SSatish Balay     (*fact)->info.factor_mallocs    = realloc;
30424e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
3043329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3044309c388cSBarry Smith   }
30454e2b4712SSatish Balay 
3046309c388cSBarry Smith   if (row_identity && col_identity) {
3047732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
30488661488fSKris Buschelman   }
30498661488fSKris Buschelman   PetscFunctionReturn(0);
30508661488fSKris Buschelman }
30518661488fSKris Buschelman 
3052732ee342SKris Buschelman #undef __FUNCT__
3053732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3054732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
30558661488fSKris Buschelman {
30568661488fSKris Buschelman   /*
30578661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
30588661488fSKris Buschelman       with natural ordering
30598661488fSKris Buschelman   */
30608661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
30616b7cc795SKris Buschelman   PetscTruth  sse_enabled_local,sse_enabled_global;
3062732ee342SKris Buschelman   int         ierr;
30638661488fSKris Buschelman 
30648661488fSKris Buschelman   PetscFunctionBegin;
3065a7ba9c3cSKris Buschelman   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3066a7ba9c3cSKris Buschelman   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
30678661488fSKris Buschelman   switch (a->bs) {
30688661488fSKris Buschelman   case 1:
30698661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3070732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3071732ee342SKris Buschelman     break;
3072309c388cSBarry Smith   case 2:
30738661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3074732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3075309c388cSBarry Smith     break;
3076309c388cSBarry Smith   case 3:
30778661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3078732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3079309c388cSBarry Smith     break;
3080309c388cSBarry Smith   case 4:
3081*ccaa8a1bSKris Buschelman     ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
30826b7cc795SKris Buschelman     if (sse_enabled_local) {
3083b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
30848661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
3085732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
3086b988c221SKris Buschelman #else
3087b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3088b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3089b988c221SKris Buschelman #endif
30903ba47ebaSKris Buschelman     } else {
30918661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3092732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
30933ba47ebaSKris Buschelman     }
3094309c388cSBarry Smith     break;
3095309c388cSBarry Smith   case 5:
30968661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3097732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3098309c388cSBarry Smith     break;
3099309c388cSBarry Smith   case 6:
31008661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3101732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3102309c388cSBarry Smith     break;
3103309c388cSBarry Smith   case 7:
31048661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3105732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3106309c388cSBarry Smith     break;
3107309c388cSBarry Smith   }
31084e2b4712SSatish Balay   PetscFunctionReturn(0);
31094e2b4712SSatish Balay }
3110732ee342SKris Buschelman 
3111732ee342SKris Buschelman #undef __FUNCT__
3112732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3113732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A)
3114732ee342SKris Buschelman {
3115732ee342SKris Buschelman   /*
3116732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3117732ee342SKris Buschelman       with natural ordering
3118732ee342SKris Buschelman   */
3119732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3120732ee342SKris Buschelman   IS          row = a->row, col = a->col;
3121732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
312294ee7fc8SKris Buschelman   PetscTruth  use_single, use_natural;
3123732ee342SKris Buschelman   int         ierr;
3124732ee342SKris Buschelman 
3125732ee342SKris Buschelman   PetscFunctionBegin;
3126cf242676SKris Buschelman 
312794ee7fc8SKris Buschelman   use_single  = PETSC_FALSE;
312894ee7fc8SKris Buschelman   use_natural = PETSC_FALSE;
3129cf242676SKris Buschelman 
3130732ee342SKris Buschelman   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3131732ee342SKris Buschelman   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3132732ee342SKris Buschelman 
3133732ee342SKris Buschelman   if (row_identity && col_identity) {
3134732ee342SKris Buschelman     use_natural = PETSC_TRUE;
3135732ee342SKris Buschelman   } else {
3136732ee342SKris Buschelman     use_natural = PETSC_FALSE;
3137732ee342SKris Buschelman   }
3138732ee342SKris Buschelman   switch (a->bs) {
3139732ee342SKris Buschelman   case 1:
3140732ee342SKris Buschelman     if (use_natural) {
3141732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3142732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3143732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3144732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3145732ee342SKris Buschelman     } else {
3146732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
3147732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3148732ee342SKris Buschelman     }
3149732ee342SKris Buschelman     break;
3150732ee342SKris Buschelman   case 2:
3151732ee342SKris Buschelman     if (use_natural) {
3152732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3153732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3154732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3155732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3156732ee342SKris Buschelman     } else {
3157732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
3158732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3159732ee342SKris Buschelman     }
3160732ee342SKris Buschelman     break;
3161732ee342SKris Buschelman   case 3:
3162732ee342SKris Buschelman     if (use_natural) {
3163732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3164732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3165732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3166732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3167732ee342SKris Buschelman     } else {
3168732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
3169732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3170732ee342SKris Buschelman     }
3171732ee342SKris Buschelman     break;
3172732ee342SKris Buschelman   case 4:
3173f26ec98cSKris Buschelman     {
3174f26ec98cSKris Buschelman       PetscTruth sse_enabled_local, sse_enabled_global;
3175cf242676SKris Buschelman       PetscTruth single_prec, flg;
3176cf242676SKris Buschelman       single_prec = flg = PETSC_FALSE;
3177f26ec98cSKris Buschelman 
3178*ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3179cf242676SKris Buschelman       ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr);
3180cf242676SKris Buschelman       if (flg) {
3181cf242676SKris Buschelman         a->single_precision_solves = single_prec;
3182cf242676SKris Buschelman       }
3183732ee342SKris Buschelman       if (a->single_precision_solves) {
3184732ee342SKris Buschelman         use_single = PETSC_TRUE;
3185732ee342SKris Buschelman       }
3186732ee342SKris Buschelman       if (use_natural) {
31872859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3188732ee342SKris Buschelman         if (use_single) {
3189f26ec98cSKris Buschelman           if (sse_enabled_local) { /* Natural + Single + SSE */
3190732ee342SKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3191732ee342SKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
3192f26ec98cSKris Buschelman           } else { /* Natural + Single */
3193f26ec98cSKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3194f26ec98cSKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place natural ordering solve BS=4\n");
3195f26ec98cSKris Buschelman           }
3196f26ec98cSKris Buschelman         } else { /* Natural */
3197732ee342SKris Buschelman           A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3198732ee342SKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3199f26ec98cSKris Buschelman         } /* Only one version of SolveTranspose for Natural Ordering */
32002859b196SKris Buschelman #else
32012859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
32022859b196SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
32032859b196SKris Buschelman #endif
3204732ee342SKris Buschelman         A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3205732ee342SKris Buschelman         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3206f26ec98cSKris Buschelman       } else { /* Arbitrary ordering */
32072859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3208732ee342SKris Buschelman         if (use_single) {
3209f26ec98cSKris Buschelman           if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3210732ee342SKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3211732ee342SKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3212f26ec98cSKris Buschelman           } else { /* Arbitrary + Single */
3213f26ec98cSKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3214f26ec98cSKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3215732ee342SKris Buschelman           }
3216f26ec98cSKris Buschelman         } else { /* Arbitrary */
3217f26ec98cSKris Buschelman           A->ops->solve           = MatSolve_SeqBAIJ_4;
3218f26ec98cSKris Buschelman         } /* Only one version of SolveTranspose for Natural Ordering */
32192859b196SKris Buschelman #else
32202859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
32212859b196SKris Buschelman #endif
3222732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3223732ee342SKris Buschelman       }
3224f26ec98cSKris Buschelman     }
3225732ee342SKris Buschelman     break;
3226732ee342SKris Buschelman   case 5:
3227732ee342SKris Buschelman     if (use_natural) {
3228732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3229732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3230732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3231732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3232732ee342SKris Buschelman     } else {
3233732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3234732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3235732ee342SKris Buschelman     }
3236732ee342SKris Buschelman     break;
3237732ee342SKris Buschelman   case 6:
3238732ee342SKris Buschelman     if (use_natural) {
3239732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3240732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3241732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3242732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3243732ee342SKris Buschelman     } else {
3244732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3245732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3246732ee342SKris Buschelman     }
3247732ee342SKris Buschelman     break;
3248732ee342SKris Buschelman   case 7:
3249732ee342SKris Buschelman     if (use_natural) {
3250732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3251732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3252732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3253732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3254732ee342SKris Buschelman     } else {
3255732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3256732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3257732ee342SKris Buschelman     }
3258732ee342SKris Buschelman     break;
325931801e53SKris Buschelman   default:
326031801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
326131801e53SKris Buschelman     break;
3262732ee342SKris Buschelman   }
3263732ee342SKris Buschelman   PetscFunctionReturn(0);
3264732ee342SKris Buschelman }
3265732ee342SKris Buschelman 
3266732ee342SKris Buschelman #undef __FUNCT__
3267732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3268732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3269732ee342SKris Buschelman   int ierr;
3270732ee342SKris Buschelman 
3271732ee342SKris Buschelman   PetscFunctionBegin;
3272732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3273cf242676SKris Buschelman   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3274732ee342SKris Buschelman     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3275cf242676SKris Buschelman   } else {
3276cf242676SKris Buschelman     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3277cf242676SKris Buschelman   }
3278732ee342SKris Buschelman   PetscFunctionReturn(0);
3279732ee342SKris Buschelman }
3280732ee342SKris Buschelman 
3281732ee342SKris Buschelman #undef __FUNCT__
3282732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3283732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3284732ee342SKris Buschelman   int ierr;
3285732ee342SKris Buschelman 
3286732ee342SKris Buschelman   PetscFunctionBegin;
3287732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3288732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3289732ee342SKris Buschelman   PetscFunctionReturn(0);
3290732ee342SKris Buschelman }
3291732ee342SKris Buschelman 
3292732ee342SKris Buschelman 
3293