xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6849ba73f22fecb8f92ef896a42e4e8bd4cd6965)
14e2b4712SSatish Balay /*
24e2b4712SSatish Balay     Factorization code for BAIJ format.
34e2b4712SSatish Balay */
44e2b4712SSatish Balay 
54e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
64e2b4712SSatish Balay #include "src/inline/ilu.h"
774c49faeSBarry Smith #include "src/inline/dot.h"
84e2b4712SSatish Balay 
94a2ae208SSatish Balay #undef __FUNCT__
104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
11dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
12f1af5d2fSBarry Smith {
13f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
14dfbe8321SBarry Smith   PetscErrorCode ierr;
15dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16f1af5d2fSBarry Smith   int             *diag = a->diag;
17f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
1887828ca2SBarry Smith   PetscScalar     s1,*x,*b;
19f1af5d2fSBarry Smith 
20f1af5d2fSBarry Smith   PetscFunctionBegin;
21ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
221ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24f1af5d2fSBarry Smith 
25f1af5d2fSBarry Smith   /* forward solve the U^T */
26f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
27f1af5d2fSBarry Smith 
28f1af5d2fSBarry Smith     v     = aa + diag[i];
29f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
30ef66eb69SBarry Smith     s1    = (*v++)*x[i];
31f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
32f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
33f1af5d2fSBarry Smith     while (nz--) {
34f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
35f1af5d2fSBarry Smith     }
36f1af5d2fSBarry Smith     x[i]   = s1;
37f1af5d2fSBarry Smith   }
38f1af5d2fSBarry Smith   /* backward solve the L^T */
39f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
40f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
41f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
42f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
43f1af5d2fSBarry Smith     s1   = x[i];
44f1af5d2fSBarry Smith     while (nz--) {
45f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
46f1af5d2fSBarry Smith     }
47f1af5d2fSBarry Smith   }
481ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
51f1af5d2fSBarry Smith   PetscFunctionReturn(0);
52f1af5d2fSBarry Smith }
53f1af5d2fSBarry Smith 
544a2ae208SSatish Balay #undef __FUNCT__
554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
56dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57f1af5d2fSBarry Smith {
58f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59dfbe8321SBarry Smith   PetscErrorCode ierr;
60dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
62f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
6387828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
6487828ca2SBarry Smith   PetscScalar     *x,*b;
65f1af5d2fSBarry Smith 
66f1af5d2fSBarry Smith   PetscFunctionBegin;
67ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
681ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70f1af5d2fSBarry Smith 
71f1af5d2fSBarry Smith   /* forward solve the U^T */
72f1af5d2fSBarry Smith   idx = 0;
73f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
76f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
77ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
78f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
79f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
80f1af5d2fSBarry Smith     v += 4;
81f1af5d2fSBarry Smith 
82f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
83f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
84f1af5d2fSBarry Smith     while (nz--) {
85f1af5d2fSBarry Smith       oidx = 2*(*vi++);
86f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88f1af5d2fSBarry Smith       v  += 4;
89f1af5d2fSBarry Smith     }
90f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
91f1af5d2fSBarry Smith     idx += 2;
92f1af5d2fSBarry Smith   }
93f1af5d2fSBarry Smith   /* backward solve the L^T */
94f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
95f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
96f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
97f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
98f1af5d2fSBarry Smith     idt  = 2*i;
99f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
100f1af5d2fSBarry Smith     while (nz--) {
101f1af5d2fSBarry Smith       idx   = 2*(*vi--);
102f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104f1af5d2fSBarry Smith       v -= 4;
105f1af5d2fSBarry Smith     }
106f1af5d2fSBarry Smith   }
1071ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110f1af5d2fSBarry Smith   PetscFunctionReturn(0);
111f1af5d2fSBarry Smith }
112f1af5d2fSBarry Smith 
1134a2ae208SSatish Balay #undef __FUNCT__
1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
115dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116f1af5d2fSBarry Smith {
117f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118dfbe8321SBarry Smith   PetscErrorCode ierr;
119dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
120f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
121f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
12287828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
12387828ca2SBarry Smith   PetscScalar     *x,*b;
124f1af5d2fSBarry Smith 
125f1af5d2fSBarry Smith   PetscFunctionBegin;
126ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129f1af5d2fSBarry Smith 
130f1af5d2fSBarry Smith   /* forward solve the U^T */
131f1af5d2fSBarry Smith   idx = 0;
132f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
135f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
136ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
137f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
138f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
139f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
140f1af5d2fSBarry Smith     v += 9;
141f1af5d2fSBarry Smith 
142f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
143f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
144f1af5d2fSBarry Smith     while (nz--) {
145f1af5d2fSBarry Smith       oidx = 3*(*vi++);
146f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
147f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
148f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
149f1af5d2fSBarry Smith       v  += 9;
150f1af5d2fSBarry Smith     }
151f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
152f1af5d2fSBarry Smith     idx += 3;
153f1af5d2fSBarry Smith   }
154f1af5d2fSBarry Smith   /* backward solve the L^T */
155f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
156f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
157f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
158f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
159f1af5d2fSBarry Smith     idt  = 3*i;
160f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
161f1af5d2fSBarry Smith     while (nz--) {
162f1af5d2fSBarry Smith       idx   = 3*(*vi--);
163f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
164f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
165f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
166f1af5d2fSBarry Smith       v -= 9;
167f1af5d2fSBarry Smith     }
168f1af5d2fSBarry Smith   }
1691ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
171b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
172f1af5d2fSBarry Smith   PetscFunctionReturn(0);
173f1af5d2fSBarry Smith }
174f1af5d2fSBarry Smith 
1754a2ae208SSatish Balay #undef __FUNCT__
1764a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
177dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
178f1af5d2fSBarry Smith {
179f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
180dfbe8321SBarry Smith   PetscErrorCode ierr;
181dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
182f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
183f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
18487828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
18587828ca2SBarry Smith   PetscScalar     *x,*b;
186f1af5d2fSBarry Smith 
187f1af5d2fSBarry Smith   PetscFunctionBegin;
188ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1891ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191f1af5d2fSBarry Smith 
192f1af5d2fSBarry Smith   /* forward solve the U^T */
193f1af5d2fSBarry Smith   idx = 0;
194f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
197f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
198ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
199f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
200f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
201f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
202f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
203f1af5d2fSBarry Smith     v += 16;
204f1af5d2fSBarry Smith 
205f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
206f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
207f1af5d2fSBarry Smith     while (nz--) {
208f1af5d2fSBarry Smith       oidx = 4*(*vi++);
209f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
210f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
211f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
212f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
213f1af5d2fSBarry Smith       v  += 16;
214f1af5d2fSBarry Smith     }
215f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
216f1af5d2fSBarry Smith     idx += 4;
217f1af5d2fSBarry Smith   }
218f1af5d2fSBarry Smith   /* backward solve the L^T */
219f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
220f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
221f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
222f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
223f1af5d2fSBarry Smith     idt  = 4*i;
224f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
225f1af5d2fSBarry Smith     while (nz--) {
226f1af5d2fSBarry Smith       idx   = 4*(*vi--);
227f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
228f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
229f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
230f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
231f1af5d2fSBarry Smith       v -= 16;
232f1af5d2fSBarry Smith     }
233f1af5d2fSBarry Smith   }
2341ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2351ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
237f1af5d2fSBarry Smith   PetscFunctionReturn(0);
238f1af5d2fSBarry Smith }
239f1af5d2fSBarry Smith 
2404a2ae208SSatish Balay #undef __FUNCT__
2414a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
242dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
243f1af5d2fSBarry Smith {
244f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
245dfbe8321SBarry Smith   PetscErrorCode ierr;
246dfbe8321SBarry Smith   int  i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
247f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
248f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
24987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25087828ca2SBarry Smith   PetscScalar     *x,*b;
251f1af5d2fSBarry Smith 
252f1af5d2fSBarry Smith   PetscFunctionBegin;
253ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2541ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2551ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256f1af5d2fSBarry Smith 
257f1af5d2fSBarry Smith   /* forward solve the U^T */
258f1af5d2fSBarry Smith   idx = 0;
259f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
262f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
263ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
265f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
266f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
267f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
268f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
269f1af5d2fSBarry Smith     v += 25;
270f1af5d2fSBarry Smith 
271f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
272f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
273f1af5d2fSBarry Smith     while (nz--) {
274f1af5d2fSBarry Smith       oidx = 5*(*vi++);
275f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
276f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
277f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
278f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
279f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
280f1af5d2fSBarry Smith       v  += 25;
281f1af5d2fSBarry Smith     }
282f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
283f1af5d2fSBarry Smith     idx += 5;
284f1af5d2fSBarry Smith   }
285f1af5d2fSBarry Smith   /* backward solve the L^T */
286f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
287f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
288f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
289f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
290f1af5d2fSBarry Smith     idt  = 5*i;
291f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
292f1af5d2fSBarry Smith     while (nz--) {
293f1af5d2fSBarry Smith       idx   = 5*(*vi--);
294f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
295f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
296f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
297f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
298f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
299f1af5d2fSBarry Smith       v -= 25;
300f1af5d2fSBarry Smith     }
301f1af5d2fSBarry Smith   }
3021ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
304b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
305f1af5d2fSBarry Smith   PetscFunctionReturn(0);
306f1af5d2fSBarry Smith }
307f1af5d2fSBarry Smith 
3084a2ae208SSatish Balay #undef __FUNCT__
3094a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
310dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
311f1af5d2fSBarry Smith {
312f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
313dfbe8321SBarry Smith   PetscErrorCode ierr;
314dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
315f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
316f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
31787828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31887828ca2SBarry Smith   PetscScalar     *x,*b;
319f1af5d2fSBarry Smith 
320f1af5d2fSBarry Smith   PetscFunctionBegin;
321ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3221ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
324f1af5d2fSBarry Smith 
325f1af5d2fSBarry Smith   /* forward solve the U^T */
326f1af5d2fSBarry Smith   idx = 0;
327f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
330f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
331ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
332ef66eb69SBarry Smith     x6    = x[5+idx];
333f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
334f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
335f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
336f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
337f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
338f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
339f1af5d2fSBarry Smith     v += 36;
340f1af5d2fSBarry Smith 
341f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
342f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
343f1af5d2fSBarry Smith     while (nz--) {
344f1af5d2fSBarry Smith       oidx = 6*(*vi++);
345f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
346f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
347f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
348f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
349f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
350f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
351f1af5d2fSBarry Smith       v  += 36;
352f1af5d2fSBarry Smith     }
353f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
354f1af5d2fSBarry Smith     x[5+idx] = s6;
355f1af5d2fSBarry Smith     idx += 6;
356f1af5d2fSBarry Smith   }
357f1af5d2fSBarry Smith   /* backward solve the L^T */
358f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
359f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
360f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
361f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
362f1af5d2fSBarry Smith     idt  = 6*i;
363f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
364f1af5d2fSBarry Smith     s6 = x[5+idt];
365f1af5d2fSBarry Smith     while (nz--) {
366f1af5d2fSBarry Smith       idx   = 6*(*vi--);
367f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
368f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
369f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
370f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
371f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
372f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
373f1af5d2fSBarry Smith       v -= 36;
374f1af5d2fSBarry Smith     }
375f1af5d2fSBarry Smith   }
3761ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
378b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
379f1af5d2fSBarry Smith   PetscFunctionReturn(0);
380f1af5d2fSBarry Smith }
381f1af5d2fSBarry Smith 
3824a2ae208SSatish Balay #undef __FUNCT__
3834a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
384dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
385f1af5d2fSBarry Smith {
386f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
387dfbe8321SBarry Smith   PetscErrorCode ierr;
388dfbe8321SBarry Smith   int i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
389f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
390f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
39187828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39287828ca2SBarry Smith   PetscScalar     *x,*b;
393f1af5d2fSBarry Smith 
394f1af5d2fSBarry Smith   PetscFunctionBegin;
395ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3961ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3971ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
398f1af5d2fSBarry Smith 
399f1af5d2fSBarry Smith   /* forward solve the U^T */
400f1af5d2fSBarry Smith   idx = 0;
401f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
404f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
405ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
406ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
407f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
408f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
409f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
410f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
411f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
412f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
413f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
414f1af5d2fSBarry Smith     v += 49;
415f1af5d2fSBarry Smith 
416f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
417f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
418f1af5d2fSBarry Smith     while (nz--) {
419f1af5d2fSBarry Smith       oidx = 7*(*vi++);
420f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
421f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
422f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
423f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
424f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
425f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
426f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
427f1af5d2fSBarry Smith       v  += 49;
428f1af5d2fSBarry Smith     }
429f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
430f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
431f1af5d2fSBarry Smith     idx += 7;
432f1af5d2fSBarry Smith   }
433f1af5d2fSBarry Smith   /* backward solve the L^T */
434f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
435f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
436f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
437f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
438f1af5d2fSBarry Smith     idt  = 7*i;
439f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
440f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
441f1af5d2fSBarry Smith     while (nz--) {
442f1af5d2fSBarry Smith       idx   = 7*(*vi--);
443f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
444f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
445f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
446f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
447f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
448f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
449f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
450f1af5d2fSBarry Smith       v -= 49;
451f1af5d2fSBarry Smith     }
452f1af5d2fSBarry Smith   }
4531ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
455b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
456f1af5d2fSBarry Smith   PetscFunctionReturn(0);
457f1af5d2fSBarry Smith }
458f1af5d2fSBarry Smith 
459f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4604a2ae208SSatish Balay #undef __FUNCT__
4614a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
462dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
463f1af5d2fSBarry Smith {
464f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
465f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
466*6849ba73SBarry Smith   PetscErrorCode ierr;
467*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
468f1af5d2fSBarry Smith   int             *diag = a->diag;
469f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
47087828ca2SBarry Smith   PetscScalar     s1,*x,*b,*t;
471f1af5d2fSBarry Smith 
472f1af5d2fSBarry Smith   PetscFunctionBegin;
4731ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
475f1af5d2fSBarry Smith   t  = a->solve_work;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
478f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
479f1af5d2fSBarry Smith 
480f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
481f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
482f1af5d2fSBarry Smith     t[i] = b[c[i]];
483f1af5d2fSBarry Smith   }
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* forward solve the U^T */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith 
488f1af5d2fSBarry Smith     v     = aa + diag[i];
489f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
490f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
491f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
492f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
493f1af5d2fSBarry Smith     while (nz--) {
494f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
495f1af5d2fSBarry Smith     }
496f1af5d2fSBarry Smith     t[i]   = s1;
497f1af5d2fSBarry Smith   }
498f1af5d2fSBarry Smith   /* backward solve the L^T */
499f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
500f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
501f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
502f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
503f1af5d2fSBarry Smith     s1   = t[i];
504f1af5d2fSBarry Smith     while (nz--) {
505f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
506f1af5d2fSBarry Smith     }
507f1af5d2fSBarry Smith   }
508f1af5d2fSBarry Smith 
509f1af5d2fSBarry Smith   /* copy t into x according to permutation */
510f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
511f1af5d2fSBarry Smith     x[r[i]]   = t[i];
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
515f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5161ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
519f1af5d2fSBarry Smith   PetscFunctionReturn(0);
520f1af5d2fSBarry Smith }
521f1af5d2fSBarry Smith 
5224a2ae208SSatish Balay #undef __FUNCT__
5234a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
/*
   MatSolveTranspose_SeqBAIJ_2 - Transpose triangular solves for block size 2,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 2x2 block occupies 4 consecutive MatScalars in a->a.  The products
   pair v[0],v[1] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 2*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
524dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
525f1af5d2fSBarry Smith {
526f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
527f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
528*6849ba73SBarry Smith   PetscErrorCode ierr;
529*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
530f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
531f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
53287828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
53387828ca2SBarry Smith   PetscScalar     *x,*b,*t;
534f1af5d2fSBarry Smith 
535f1af5d2fSBarry Smith   PetscFunctionBegin;
5361ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
538f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
539f1af5d2fSBarry Smith 
540f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
541f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
542f1af5d2fSBarry Smith 
543f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
544f1af5d2fSBarry Smith   ii = 0;
545f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
546f1af5d2fSBarry Smith     ic      = 2*c[i];
547f1af5d2fSBarry Smith     t[ii]   = b[ic];
548f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
549f1af5d2fSBarry Smith     ii += 2;
550f1af5d2fSBarry Smith   }
551f1af5d2fSBarry Smith 
552f1af5d2fSBarry Smith   /* forward solve the U^T */
553f1af5d2fSBarry Smith   idx = 0;
554f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
557f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
558f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
559f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
560f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
561f1af5d2fSBarry Smith     v += 4;
562f1af5d2fSBarry Smith 
563f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
564f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
565f1af5d2fSBarry Smith     while (nz--) {
566f1af5d2fSBarry Smith       oidx = 2*(*vi++);
567f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
568f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
569f1af5d2fSBarry Smith       v  += 4;
570f1af5d2fSBarry Smith     }
571f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; /* write back the scaled block for row i */
572f1af5d2fSBarry Smith     idx += 2;
573f1af5d2fSBarry Smith   }
574f1af5d2fSBarry Smith   /* backward solve the L^T */
575f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
576f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4; /* last block stored before the diagonal; walked backwards */
577f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
578f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
579f1af5d2fSBarry Smith     idt  = 2*i;
580f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
581f1af5d2fSBarry Smith     while (nz--) {
582f1af5d2fSBarry Smith       idx   = 2*(*vi--);
583f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
584f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
585f1af5d2fSBarry Smith       v -= 4;
586f1af5d2fSBarry Smith     }
587f1af5d2fSBarry Smith   }
588f1af5d2fSBarry Smith 
589f1af5d2fSBarry Smith   /* copy t into x according to permutation */
590f1af5d2fSBarry Smith   ii = 0;
591f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
592f1af5d2fSBarry Smith     ir      = 2*r[i];
593f1af5d2fSBarry Smith     x[ir]   = t[ii];
594f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
595f1af5d2fSBarry Smith     ii += 2;
596f1af5d2fSBarry Smith   }
597f1af5d2fSBarry Smith 
598f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
599f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6001ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
602b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
603f1af5d2fSBarry Smith   PetscFunctionReturn(0);
604f1af5d2fSBarry Smith }
605f1af5d2fSBarry Smith 
6064a2ae208SSatish Balay #undef __FUNCT__
6074a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
/*
   MatSolveTranspose_SeqBAIJ_3 - Transpose triangular solves for block size 3,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 3x3 block occupies 9 consecutive MatScalars in a->a.  The products
   pair v[0..2] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 3*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
608dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
609f1af5d2fSBarry Smith {
610f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
611f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
612*6849ba73SBarry Smith   PetscErrorCode ierr;
613*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
614f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
615f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
61687828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
61787828ca2SBarry Smith   PetscScalar     *x,*b,*t;
618f1af5d2fSBarry Smith 
619f1af5d2fSBarry Smith   PetscFunctionBegin;
6201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
622f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
623f1af5d2fSBarry Smith 
624f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
625f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
626f1af5d2fSBarry Smith 
627f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
628f1af5d2fSBarry Smith   ii = 0;
629f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
630f1af5d2fSBarry Smith     ic      = 3*c[i];
631f1af5d2fSBarry Smith     t[ii]   = b[ic];
632f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
633f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
634f1af5d2fSBarry Smith     ii += 3;
635f1af5d2fSBarry Smith   }
636f1af5d2fSBarry Smith 
637f1af5d2fSBarry Smith   /* forward solve the U^T */
638f1af5d2fSBarry Smith   idx = 0;
639f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
640f1af5d2fSBarry Smith 
641f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
642f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
643f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
644f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
645f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
646f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
647f1af5d2fSBarry Smith     v += 9;
648f1af5d2fSBarry Smith 
649f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
650f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
651f1af5d2fSBarry Smith     while (nz--) {
652f1af5d2fSBarry Smith       oidx = 3*(*vi++);
653f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
654f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
655f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
656f1af5d2fSBarry Smith       v  += 9;
657f1af5d2fSBarry Smith     }
658f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3; /* write back the scaled block for row i */
659f1af5d2fSBarry Smith     idx += 3;
660f1af5d2fSBarry Smith   }
661f1af5d2fSBarry Smith   /* backward solve the L^T */
662f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
663f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9; /* last block stored before the diagonal; walked backwards */
664f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
665f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
666f1af5d2fSBarry Smith     idt  = 3*i;
667f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
668f1af5d2fSBarry Smith     while (nz--) {
669f1af5d2fSBarry Smith       idx   = 3*(*vi--);
670f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
671f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
672f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
673f1af5d2fSBarry Smith       v -= 9;
674f1af5d2fSBarry Smith     }
675f1af5d2fSBarry Smith   }
676f1af5d2fSBarry Smith 
677f1af5d2fSBarry Smith   /* copy t into x according to permutation */
678f1af5d2fSBarry Smith   ii = 0;
679f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
680f1af5d2fSBarry Smith     ir      = 3*r[i];
681f1af5d2fSBarry Smith     x[ir]   = t[ii];
682f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
683f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
684f1af5d2fSBarry Smith     ii += 3;
685f1af5d2fSBarry Smith   }
686f1af5d2fSBarry Smith 
687f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
688f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
691b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
692f1af5d2fSBarry Smith   PetscFunctionReturn(0);
693f1af5d2fSBarry Smith }
694f1af5d2fSBarry Smith 
6954a2ae208SSatish Balay #undef __FUNCT__
6964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
/*
   MatSolveTranspose_SeqBAIJ_4 - Transpose triangular solves for block size 4,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 4x4 block occupies 16 consecutive MatScalars in a->a.  The products
   pair v[0..3] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 4*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
697dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
698f1af5d2fSBarry Smith {
699f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
700f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
701*6849ba73SBarry Smith   PetscErrorCode ierr;
702*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
703f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
704f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
70587828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
70687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
707f1af5d2fSBarry Smith 
708f1af5d2fSBarry Smith   PetscFunctionBegin;
7091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
711f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
712f1af5d2fSBarry Smith 
713f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
714f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
717f1af5d2fSBarry Smith   ii = 0;
718f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
719f1af5d2fSBarry Smith     ic      = 4*c[i];
720f1af5d2fSBarry Smith     t[ii]   = b[ic];
721f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
722f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
723f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
724f1af5d2fSBarry Smith     ii += 4;
725f1af5d2fSBarry Smith   }
726f1af5d2fSBarry Smith 
727f1af5d2fSBarry Smith   /* forward solve the U^T */
728f1af5d2fSBarry Smith   idx = 0;
729f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
730f1af5d2fSBarry Smith 
731f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
732f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
733f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
734f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
735f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
736f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
737f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
738f1af5d2fSBarry Smith     v += 16;
739f1af5d2fSBarry Smith 
740f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
741f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
742f1af5d2fSBarry Smith     while (nz--) {
743f1af5d2fSBarry Smith       oidx = 4*(*vi++);
744f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
745f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
746f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
747f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
748f1af5d2fSBarry Smith       v  += 16;
749f1af5d2fSBarry Smith     }
750f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; /* write back the scaled block for row i */
751f1af5d2fSBarry Smith     idx += 4;
752f1af5d2fSBarry Smith   }
753f1af5d2fSBarry Smith   /* backward solve the L^T */
754f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
755f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16; /* last block stored before the diagonal; walked backwards */
756f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
757f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
758f1af5d2fSBarry Smith     idt  = 4*i;
759f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
760f1af5d2fSBarry Smith     while (nz--) {
761f1af5d2fSBarry Smith       idx   = 4*(*vi--);
762f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
763f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
764f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
765f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
766f1af5d2fSBarry Smith       v -= 16;
767f1af5d2fSBarry Smith     }
768f1af5d2fSBarry Smith   }
769f1af5d2fSBarry Smith 
770f1af5d2fSBarry Smith   /* copy t into x according to permutation */
771f1af5d2fSBarry Smith   ii = 0;
772f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
773f1af5d2fSBarry Smith     ir      = 4*r[i];
774f1af5d2fSBarry Smith     x[ir]   = t[ii];
775f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
776f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
777f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
778f1af5d2fSBarry Smith     ii += 4;
779f1af5d2fSBarry Smith   }
780f1af5d2fSBarry Smith 
781f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
782f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7831ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7841ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
785b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
786f1af5d2fSBarry Smith   PetscFunctionReturn(0);
787f1af5d2fSBarry Smith }
788f1af5d2fSBarry Smith 
7894a2ae208SSatish Balay #undef __FUNCT__
7904a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
/*
   MatSolveTranspose_SeqBAIJ_5 - Transpose triangular solves for block size 5,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 5x5 block occupies 25 consecutive MatScalars in a->a.  The products
   pair v[0..4] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 5*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
791dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
792f1af5d2fSBarry Smith {
793f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
794f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
795*6849ba73SBarry Smith   PetscErrorCode ierr;
796*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
797f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
798f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
79987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80087828ca2SBarry Smith   PetscScalar     *x,*b,*t;
801f1af5d2fSBarry Smith 
802f1af5d2fSBarry Smith   PetscFunctionBegin;
8031ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
805f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
806f1af5d2fSBarry Smith 
807f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
808f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
809f1af5d2fSBarry Smith 
810f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
811f1af5d2fSBarry Smith   ii = 0;
812f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
813f1af5d2fSBarry Smith     ic      = 5*c[i];
814f1af5d2fSBarry Smith     t[ii]   = b[ic];
815f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
816f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
817f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
818f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
819f1af5d2fSBarry Smith     ii += 5;
820f1af5d2fSBarry Smith   }
821f1af5d2fSBarry Smith 
822f1af5d2fSBarry Smith   /* forward solve the U^T */
823f1af5d2fSBarry Smith   idx = 0;
824f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
825f1af5d2fSBarry Smith 
826f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
827f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
828f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
829f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
830f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
831f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
832f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
833f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
834f1af5d2fSBarry Smith     v += 25;
835f1af5d2fSBarry Smith 
836f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
837f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
838f1af5d2fSBarry Smith     while (nz--) {
839f1af5d2fSBarry Smith       oidx = 5*(*vi++);
840f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
841f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
842f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
843f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
844f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
845f1af5d2fSBarry Smith       v  += 25;
846f1af5d2fSBarry Smith     }
847f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; /* write back the scaled block for row i */
848f1af5d2fSBarry Smith     idx += 5;
849f1af5d2fSBarry Smith   }
850f1af5d2fSBarry Smith   /* backward solve the L^T */
851f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
852f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25; /* last block stored before the diagonal; walked backwards */
853f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
854f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
855f1af5d2fSBarry Smith     idt  = 5*i;
856f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
857f1af5d2fSBarry Smith     while (nz--) {
858f1af5d2fSBarry Smith       idx   = 5*(*vi--);
859f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
860f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
861f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
862f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
863f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
864f1af5d2fSBarry Smith       v -= 25;
865f1af5d2fSBarry Smith     }
866f1af5d2fSBarry Smith   }
867f1af5d2fSBarry Smith 
868f1af5d2fSBarry Smith   /* copy t into x according to permutation */
869f1af5d2fSBarry Smith   ii = 0;
870f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
871f1af5d2fSBarry Smith     ir      = 5*r[i];
872f1af5d2fSBarry Smith     x[ir]   = t[ii];
873f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
874f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
875f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
876f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
877f1af5d2fSBarry Smith     ii += 5;
878f1af5d2fSBarry Smith   }
879f1af5d2fSBarry Smith 
880f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
881f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8821ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8831ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
884b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
885f1af5d2fSBarry Smith   PetscFunctionReturn(0);
886f1af5d2fSBarry Smith }
887f1af5d2fSBarry Smith 
8884a2ae208SSatish Balay #undef __FUNCT__
8894a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
/*
   MatSolveTranspose_SeqBAIJ_6 - Transpose triangular solves for block size 6,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 6x6 block occupies 36 consecutive MatScalars in a->a.  The products
   pair v[0..5] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 6*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
890dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
891f1af5d2fSBarry Smith {
892f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
893f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
894*6849ba73SBarry Smith   PetscErrorCode ierr;
895*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
896f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
897f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
89887828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
89987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
900f1af5d2fSBarry Smith 
901f1af5d2fSBarry Smith   PetscFunctionBegin;
9021ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9031ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
904f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
905f1af5d2fSBarry Smith 
906f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
907f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
908f1af5d2fSBarry Smith 
909f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
910f1af5d2fSBarry Smith   ii = 0;
911f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
912f1af5d2fSBarry Smith     ic      = 6*c[i];
913f1af5d2fSBarry Smith     t[ii]   = b[ic];
914f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
915f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
916f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
917f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
918f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
919f1af5d2fSBarry Smith     ii += 6;
920f1af5d2fSBarry Smith   }
921f1af5d2fSBarry Smith 
922f1af5d2fSBarry Smith   /* forward solve the U^T */
923f1af5d2fSBarry Smith   idx = 0;
924f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
925f1af5d2fSBarry Smith 
926f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
927f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
928f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
929f1af5d2fSBarry Smith     x6    = t[5+idx];
930f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
931f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
932f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
933f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
934f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
935f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
936f1af5d2fSBarry Smith     v += 36;
937f1af5d2fSBarry Smith 
938f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
939f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
940f1af5d2fSBarry Smith     while (nz--) {
941f1af5d2fSBarry Smith       oidx = 6*(*vi++);
942f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
943f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
944f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
945f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
946f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
947f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
948f1af5d2fSBarry Smith       v  += 36;
949f1af5d2fSBarry Smith     }
950f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; /* write back the scaled block for row i */
951f1af5d2fSBarry Smith     t[5+idx] = s6;
952f1af5d2fSBarry Smith     idx += 6;
953f1af5d2fSBarry Smith   }
954f1af5d2fSBarry Smith   /* backward solve the L^T */
955f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
956f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36; /* last block stored before the diagonal; walked backwards */
957f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
958f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
959f1af5d2fSBarry Smith     idt  = 6*i;
960f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
961f1af5d2fSBarry Smith     s6 = t[5+idt];
962f1af5d2fSBarry Smith     while (nz--) {
963f1af5d2fSBarry Smith       idx   = 6*(*vi--);
964f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
965f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
966f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
967f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
968f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
969f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
970f1af5d2fSBarry Smith       v -= 36;
971f1af5d2fSBarry Smith     }
972f1af5d2fSBarry Smith   }
973f1af5d2fSBarry Smith 
974f1af5d2fSBarry Smith   /* copy t into x according to permutation */
975f1af5d2fSBarry Smith   ii = 0;
976f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
977f1af5d2fSBarry Smith     ir      = 6*r[i];
978f1af5d2fSBarry Smith     x[ir]   = t[ii];
979f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
980f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
981f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
982f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
983f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
984f1af5d2fSBarry Smith     ii += 6;
985f1af5d2fSBarry Smith   }
986f1af5d2fSBarry Smith 
987f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
988f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
991b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
992f1af5d2fSBarry Smith   PetscFunctionReturn(0);
993f1af5d2fSBarry Smith }
994f1af5d2fSBarry Smith 
9954a2ae208SSatish Balay #undef __FUNCT__
9964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
/*
   MatSolveTranspose_SeqBAIJ_7 - Transpose triangular solves for block size 7,
   using the row and column index sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose data is a Mat_SeqBAIJ (presumably already factored, with
        the diagonal blocks stored inverted; this routine does not check it)
-  bb - right-hand-side vector

   Output Parameter:
.  xx - solution vector

   Steps, as coded below:
     1. gather bb into a->solve_work, block i taken from block c[i] of bb
     2. sweep rows 0..n-1 over the entries at and after diag[i]  (U^T part)
     3. sweep rows n-1..0 over the entries before diag[i]        (L^T part)
     4. scatter the work array into xx, block i stored at block r[i]

   Notes:
   Each 7x7 block occupies 49 consecutive MatScalars in a->a.  The products
   pair v[0..6] with the first output component, which is a transpose
   multiply if blocks are stored column-major (assumed, TODO confirm against
   the factorization routine).
   a->solve_work is overwritten; it is assumed to hold at least 7*a->mbs
   scalars.  Failures of the PETSc calls are returned through CHKERRQ.
*/
997dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
998f1af5d2fSBarry Smith {
999f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
1000f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
1001*6849ba73SBarry Smith   PetscErrorCode ierr;
1002*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1003f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
1004f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
100587828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
100687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
1007f1af5d2fSBarry Smith 
1008f1af5d2fSBarry Smith   PetscFunctionBegin;
10091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1011f1af5d2fSBarry Smith   t  = a->solve_work; /* scratch vector; all sweeps below work in place on t */
1012f1af5d2fSBarry Smith 
1013f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1014f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1015f1af5d2fSBarry Smith 
1016f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1017f1af5d2fSBarry Smith   ii = 0;
1018f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1019f1af5d2fSBarry Smith     ic      = 7*c[i];
1020f1af5d2fSBarry Smith     t[ii]   = b[ic];
1021f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1022f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1023f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1024f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1025f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1026f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1027f1af5d2fSBarry Smith     ii += 7;
1028f1af5d2fSBarry Smith   }
1029f1af5d2fSBarry Smith 
1030f1af5d2fSBarry Smith   /* forward solve the U^T */
1031f1af5d2fSBarry Smith   idx = 0;
1032f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1033f1af5d2fSBarry Smith 
1034f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1035f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1036f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1037f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1038f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1039f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1040f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1041f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1042f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1043f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1044f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1045f1af5d2fSBarry Smith     v += 49;
1046f1af5d2fSBarry Smith 
1047f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1048f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1; /* number of blocks stored after the diagonal in row i */
1049f1af5d2fSBarry Smith     while (nz--) {
1050f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1051f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1052f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1053f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1054f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1055f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1056f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1057f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1058f1af5d2fSBarry Smith       v  += 49;
1059f1af5d2fSBarry Smith     }
1060f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5; /* write back the scaled block for row i */
1061f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1062f1af5d2fSBarry Smith     idx += 7;
1063f1af5d2fSBarry Smith   }
1064f1af5d2fSBarry Smith   /* backward solve the L^T */
1065f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1066f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49; /* last block stored before the diagonal; walked backwards */
1067f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1068f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1069f1af5d2fSBarry Smith     idt  = 7*i;
1070f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1071f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1072f1af5d2fSBarry Smith     while (nz--) {
1073f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1074f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1075f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1076f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1077f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1078f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1079f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1080f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1081f1af5d2fSBarry Smith       v -= 49;
1082f1af5d2fSBarry Smith     }
1083f1af5d2fSBarry Smith   }
1084f1af5d2fSBarry Smith 
1085f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1086f1af5d2fSBarry Smith   ii = 0;
1087f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1088f1af5d2fSBarry Smith     ir      = 7*r[i];
1089f1af5d2fSBarry Smith     x[ir]   = t[ii];
1090f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1091f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1092f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1093f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1094f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1095f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1096f1af5d2fSBarry Smith     ii += 7;
1097f1af5d2fSBarry Smith   }
1098f1af5d2fSBarry Smith 
1099f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1100f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11011ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1103b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1104f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1105f1af5d2fSBarry Smith }
1106f1af5d2fSBarry Smith 
11074e2b4712SSatish Balay /* ----------------------------------------------------------- */
11084a2ae208SSatish Balay #undef __FUNCT__
11094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1110dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11114e2b4712SSatish Balay {
11124e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11134e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1114*6849ba73SBarry Smith   PetscErrorCode ierr;
1115*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
11164e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
11173f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
111887828ca2SBarry Smith   PetscScalar     *x,*b,*s,*t,*ls;
11194e2b4712SSatish Balay 
11204e2b4712SSatish Balay   PetscFunctionBegin;
11211ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1123f1af5d2fSBarry Smith   t  = a->solve_work;
11244e2b4712SSatish Balay 
11254e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11264e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11274e2b4712SSatish Balay 
11284e2b4712SSatish Balay   /* forward solve the lower triangular */
112987828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11304e2b4712SSatish Balay   for (i=1; i<n; i++) {
11314e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11324e2b4712SSatish Balay     vi  = aj + ai[i];
11334e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1134f1af5d2fSBarry Smith     s = t + bs*i;
113587828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11364e2b4712SSatish Balay     while (nz--) {
1137f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11384e2b4712SSatish Balay       v += bs2;
11394e2b4712SSatish Balay     }
11404e2b4712SSatish Balay   }
11414e2b4712SSatish Balay   /* backward solve the upper triangular */
1142273d9f13SBarry Smith   ls = a->solve_work + A->n;
11434e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11444e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11454e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11464e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
114787828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
1152f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
115387828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11544e2b4712SSatish Balay   }
11554e2b4712SSatish Balay 
11564e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11574e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11581ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1160b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11614e2b4712SSatish Balay   PetscFunctionReturn(0);
11624e2b4712SSatish Balay }
11634e2b4712SSatish Balay 
11644a2ae208SSatish Balay #undef __FUNCT__
11654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1166dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11674e2b4712SSatish Balay {
11684e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11694e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1170*6849ba73SBarry Smith   PetscErrorCode ierr;
1171*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11724e2b4712SSatish Balay   int             *diag = a->diag;
11733f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
117487828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
117587828ca2SBarry Smith   PetscScalar     *x,*b,*t;
11764e2b4712SSatish Balay 
11774e2b4712SSatish Balay   PetscFunctionBegin;
11781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1180f1af5d2fSBarry Smith   t  = a->solve_work;
11814e2b4712SSatish Balay 
11824e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11834e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11844e2b4712SSatish Balay 
11854e2b4712SSatish Balay   /* forward solve the lower triangular */
11864e2b4712SSatish Balay   idx    = 7*(*r++);
1187f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1188f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1189f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11904e2b4712SSatish Balay 
11914e2b4712SSatish Balay   for (i=1; i<n; i++) {
11924e2b4712SSatish Balay     v     = aa + 49*ai[i];
11934e2b4712SSatish Balay     vi    = aj + ai[i];
11944e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11954e2b4712SSatish Balay     idx   = 7*(*r++);
1196f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1197f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11984e2b4712SSatish Balay     while (nz--) {
11994e2b4712SSatish Balay       idx   = 7*(*vi++);
1200f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1201f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1202f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1203f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1204f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1205f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1206f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1207f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1208f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1209f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12104e2b4712SSatish Balay       v += 49;
12114e2b4712SSatish Balay     }
12124e2b4712SSatish Balay     idx = 7*i;
1213f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1214f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1215f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12164e2b4712SSatish Balay   }
12174e2b4712SSatish Balay   /* backward solve the upper triangular */
12184e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12194e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12204e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12214e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12224e2b4712SSatish Balay     idt  = 7*i;
1223f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1224f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1225f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12264e2b4712SSatish Balay     while (nz--) {
12274e2b4712SSatish Balay       idx   = 7*(*vi++);
1228f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1229f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1230f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1231f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1232f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1233f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1234f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1235f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1236f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1237f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12384e2b4712SSatish Balay       v += 49;
12394e2b4712SSatish Balay     }
12404e2b4712SSatish Balay     idc = 7*(*c--);
12414e2b4712SSatish Balay     v   = aa + 49*diag[i];
1242f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1243f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1244f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1245f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1246f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1247f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1248f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1249f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1250f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1251f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1252f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1253f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1254f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1255f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12564e2b4712SSatish Balay   }
12574e2b4712SSatish Balay 
12584e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12594e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12601ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12611ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1262b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12634e2b4712SSatish Balay   PetscFunctionReturn(0);
12644e2b4712SSatish Balay }
12654e2b4712SSatish Balay 
12664a2ae208SSatish Balay #undef __FUNCT__
12674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1268dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
126915091d37SBarry Smith {
127015091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
127115091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1272dfbe8321SBarry Smith   PetscErrorCode ierr;
1273dfbe8321SBarry Smith   int *diag = a->diag,jdx;
127415091d37SBarry Smith   MatScalar       *aa=a->a,*v;
127587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
127615091d37SBarry Smith 
127715091d37SBarry Smith   PetscFunctionBegin;
12781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128015091d37SBarry Smith   /* forward solve the lower triangular */
128115091d37SBarry Smith   idx    = 0;
128215091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
128315091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
128415091d37SBarry Smith   x[6] = b[6+idx];
128515091d37SBarry Smith   for (i=1; i<n; i++) {
128615091d37SBarry Smith     v     =  aa + 49*ai[i];
128715091d37SBarry Smith     vi    =  aj + ai[i];
128815091d37SBarry Smith     nz    =  diag[i] - ai[i];
128915091d37SBarry Smith     idx   =  7*i;
1290f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1291f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1292f1af5d2fSBarry Smith     s7  =  b[6+idx];
129315091d37SBarry Smith     while (nz--) {
129415091d37SBarry Smith       jdx   = 7*(*vi++);
129515091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
129615091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
129715091d37SBarry Smith       x7    = x[6+jdx];
1298f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1299f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1300f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1301f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1302f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1303f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1304f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
130515091d37SBarry Smith       v += 49;
130615091d37SBarry Smith      }
1307f1af5d2fSBarry Smith     x[idx]   = s1;
1308f1af5d2fSBarry Smith     x[1+idx] = s2;
1309f1af5d2fSBarry Smith     x[2+idx] = s3;
1310f1af5d2fSBarry Smith     x[3+idx] = s4;
1311f1af5d2fSBarry Smith     x[4+idx] = s5;
1312f1af5d2fSBarry Smith     x[5+idx] = s6;
1313f1af5d2fSBarry Smith     x[6+idx] = s7;
131415091d37SBarry Smith   }
131515091d37SBarry Smith   /* backward solve the upper triangular */
131615091d37SBarry Smith   for (i=n-1; i>=0; i--){
131715091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
131815091d37SBarry Smith     vi   = aj + diag[i] + 1;
131915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
132015091d37SBarry Smith     idt  = 7*i;
1321f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1322f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1323f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1324f1af5d2fSBarry Smith     s7 = x[6+idt];
132515091d37SBarry Smith     while (nz--) {
132615091d37SBarry Smith       idx   = 7*(*vi++);
132715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
132815091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
132915091d37SBarry Smith       x7    = x[6+idx];
1330f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1331f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1332f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1333f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1334f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1335f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1336f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
133715091d37SBarry Smith       v += 49;
133815091d37SBarry Smith     }
133915091d37SBarry Smith     v        = aa + 49*diag[i];
1340f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1341f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1342f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1343f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1344f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1345f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1346f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1347f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1348f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1349f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1350f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1351f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1352f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1353f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
135415091d37SBarry Smith   }
135515091d37SBarry Smith 
13561ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
135915091d37SBarry Smith   PetscFunctionReturn(0);
136015091d37SBarry Smith }
136115091d37SBarry Smith 
13624a2ae208SSatish Balay #undef __FUNCT__
13634a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1364dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
136515091d37SBarry Smith {
136615091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
136715091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
1368*6849ba73SBarry Smith   PetscErrorCode ierr;
1369*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
137015091d37SBarry Smith   int             *diag = a->diag;
137115091d37SBarry Smith   MatScalar       *aa=a->a,*v;
137287828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
137315091d37SBarry Smith 
137415091d37SBarry Smith   PetscFunctionBegin;
13751ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
13761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1377f1af5d2fSBarry Smith   t  = a->solve_work;
137815091d37SBarry Smith 
137915091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
138015091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
138115091d37SBarry Smith 
138215091d37SBarry Smith   /* forward solve the lower triangular */
138315091d37SBarry Smith   idx    = 6*(*r++);
1384f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1385f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1386f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
138715091d37SBarry Smith   for (i=1; i<n; i++) {
138815091d37SBarry Smith     v     = aa + 36*ai[i];
138915091d37SBarry Smith     vi    = aj + ai[i];
139015091d37SBarry Smith     nz    = diag[i] - ai[i];
139115091d37SBarry Smith     idx   = 6*(*r++);
1392f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1393f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
139415091d37SBarry Smith     while (nz--) {
139515091d37SBarry Smith       idx   = 6*(*vi++);
1396f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1397f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1398f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1399f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1400f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1401f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1402f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1403f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
140415091d37SBarry Smith       v += 36;
140515091d37SBarry Smith     }
140615091d37SBarry Smith     idx = 6*i;
1407f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1408f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1409f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
141015091d37SBarry Smith   }
141115091d37SBarry Smith   /* backward solve the upper triangular */
141215091d37SBarry Smith   for (i=n-1; i>=0; i--){
141315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
141415091d37SBarry Smith     vi   = aj + diag[i] + 1;
141515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
141615091d37SBarry Smith     idt  = 6*i;
1417f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1418f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1419f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
142015091d37SBarry Smith     while (nz--) {
142115091d37SBarry Smith       idx   = 6*(*vi++);
1422f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1423f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1424f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1425f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1426f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1427f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1428f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1429f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1430f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
143115091d37SBarry Smith       v += 36;
143215091d37SBarry Smith     }
143315091d37SBarry Smith     idc = 6*(*c--);
143415091d37SBarry Smith     v   = aa + 36*diag[i];
1435f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1436f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1437f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1438f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1439f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1440f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1441f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1442f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1443f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1444f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1445f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1446f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
144715091d37SBarry Smith   }
144815091d37SBarry Smith 
144915091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
145015091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14511ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
14521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1453b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
145415091d37SBarry Smith   PetscFunctionReturn(0);
145515091d37SBarry Smith }
145615091d37SBarry Smith 
14574a2ae208SSatish Balay #undef __FUNCT__
14584a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1459dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
146015091d37SBarry Smith {
146115091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
146215091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1463dfbe8321SBarry Smith   PetscErrorCode ierr;
1464dfbe8321SBarry Smith   int *diag = a->diag,jdx;
146515091d37SBarry Smith   MatScalar       *aa=a->a,*v;
146687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
146715091d37SBarry Smith 
146815091d37SBarry Smith   PetscFunctionBegin;
14691ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
14701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
147115091d37SBarry Smith   /* forward solve the lower triangular */
147215091d37SBarry Smith   idx    = 0;
147315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
147415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
147515091d37SBarry Smith   for (i=1; i<n; i++) {
147615091d37SBarry Smith     v     =  aa + 36*ai[i];
147715091d37SBarry Smith     vi    =  aj + ai[i];
147815091d37SBarry Smith     nz    =  diag[i] - ai[i];
147915091d37SBarry Smith     idx   =  6*i;
1480f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1481f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
148215091d37SBarry Smith     while (nz--) {
148315091d37SBarry Smith       jdx   = 6*(*vi++);
148415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
148515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1486f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1487f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1488f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1489f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1490f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1491f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
149215091d37SBarry Smith       v += 36;
149315091d37SBarry Smith      }
1494f1af5d2fSBarry Smith     x[idx]   = s1;
1495f1af5d2fSBarry Smith     x[1+idx] = s2;
1496f1af5d2fSBarry Smith     x[2+idx] = s3;
1497f1af5d2fSBarry Smith     x[3+idx] = s4;
1498f1af5d2fSBarry Smith     x[4+idx] = s5;
1499f1af5d2fSBarry Smith     x[5+idx] = s6;
150015091d37SBarry Smith   }
150115091d37SBarry Smith   /* backward solve the upper triangular */
150215091d37SBarry Smith   for (i=n-1; i>=0; i--){
150315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
150415091d37SBarry Smith     vi   = aj + diag[i] + 1;
150515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
150615091d37SBarry Smith     idt  = 6*i;
1507f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1508f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1509f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
151015091d37SBarry Smith     while (nz--) {
151115091d37SBarry Smith       idx   = 6*(*vi++);
151215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
151315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1514f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1515f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1516f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1517f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1518f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1519f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
152015091d37SBarry Smith       v += 36;
152115091d37SBarry Smith     }
152215091d37SBarry Smith     v        = aa + 36*diag[i];
1523f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1524f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1525f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1526f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1527f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1528f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
152915091d37SBarry Smith   }
153015091d37SBarry Smith 
15311ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
15321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1533b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
153415091d37SBarry Smith   PetscFunctionReturn(0);
153515091d37SBarry Smith }
153615091d37SBarry Smith 
15374a2ae208SSatish Balay #undef __FUNCT__
15384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1539dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15404e2b4712SSatish Balay {
15414e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15424e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1543*6849ba73SBarry Smith   PetscErrorCode ierr;
1544*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15454e2b4712SSatish Balay   int             *diag = a->diag;
15463f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
154787828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15484e2b4712SSatish Balay 
15494e2b4712SSatish Balay   PetscFunctionBegin;
15501ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
15511ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1552f1af5d2fSBarry Smith   t  = a->solve_work;
15534e2b4712SSatish Balay 
15544e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15554e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15564e2b4712SSatish Balay 
15574e2b4712SSatish Balay   /* forward solve the lower triangular */
15584e2b4712SSatish Balay   idx    = 5*(*r++);
1559f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1560f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15614e2b4712SSatish Balay   for (i=1; i<n; i++) {
15624e2b4712SSatish Balay     v     = aa + 25*ai[i];
15634e2b4712SSatish Balay     vi    = aj + ai[i];
15644e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15654e2b4712SSatish Balay     idx   = 5*(*r++);
1566f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1567f1af5d2fSBarry Smith     s5  = b[4+idx];
15684e2b4712SSatish Balay     while (nz--) {
15694e2b4712SSatish Balay       idx   = 5*(*vi++);
1570f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1571f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1572f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1573f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1574f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1575f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1576f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15774e2b4712SSatish Balay       v += 25;
15784e2b4712SSatish Balay     }
15794e2b4712SSatish Balay     idx = 5*i;
1580f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1581f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15824e2b4712SSatish Balay   }
15834e2b4712SSatish Balay   /* backward solve the upper triangular */
15844e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15854e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15864e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15874e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15884e2b4712SSatish Balay     idt  = 5*i;
1589f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1590f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15914e2b4712SSatish Balay     while (nz--) {
15924e2b4712SSatish Balay       idx   = 5*(*vi++);
1593f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1594f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1595f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1596f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1597f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1598f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1599f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16004e2b4712SSatish Balay       v += 25;
16014e2b4712SSatish Balay     }
16024e2b4712SSatish Balay     idc = 5*(*c--);
16034e2b4712SSatish Balay     v   = aa + 25*diag[i];
1604f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1605f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1606f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1607f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1608f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1609f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1610f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1611f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1612f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1613f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16144e2b4712SSatish Balay   }
16154e2b4712SSatish Balay 
16164e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16174e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16181ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
16191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1620b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
16214e2b4712SSatish Balay   PetscFunctionReturn(0);
16224e2b4712SSatish Balay }
16234e2b4712SSatish Balay 
16244a2ae208SSatish Balay #undef __FUNCT__
16254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1626dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
162715091d37SBarry Smith {
162815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
162915091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1630dfbe8321SBarry Smith   PetscErrorCode ierr;
1631dfbe8321SBarry Smith   int *diag = a->diag,jdx;
163215091d37SBarry Smith   MatScalar       *aa=a->a,*v;
163387828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
163415091d37SBarry Smith 
163515091d37SBarry Smith   PetscFunctionBegin;
16361ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
16371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
163815091d37SBarry Smith   /* forward solve the lower triangular */
163915091d37SBarry Smith   idx    = 0;
164015091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
164115091d37SBarry Smith   for (i=1; i<n; i++) {
164215091d37SBarry Smith     v     =  aa + 25*ai[i];
164315091d37SBarry Smith     vi    =  aj + ai[i];
164415091d37SBarry Smith     nz    =  diag[i] - ai[i];
164515091d37SBarry Smith     idx   =  5*i;
1646f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
164715091d37SBarry Smith     while (nz--) {
164815091d37SBarry Smith       jdx   = 5*(*vi++);
164915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1650f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1651f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1652f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1653f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1654f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
165515091d37SBarry Smith       v    += 25;
165615091d37SBarry Smith     }
1657f1af5d2fSBarry Smith     x[idx]   = s1;
1658f1af5d2fSBarry Smith     x[1+idx] = s2;
1659f1af5d2fSBarry Smith     x[2+idx] = s3;
1660f1af5d2fSBarry Smith     x[3+idx] = s4;
1661f1af5d2fSBarry Smith     x[4+idx] = s5;
166215091d37SBarry Smith   }
166315091d37SBarry Smith   /* backward solve the upper triangular */
166415091d37SBarry Smith   for (i=n-1; i>=0; i--){
166515091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
166615091d37SBarry Smith     vi   = aj + diag[i] + 1;
166715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
166815091d37SBarry Smith     idt  = 5*i;
1669f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1670f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
167115091d37SBarry Smith     while (nz--) {
167215091d37SBarry Smith       idx   = 5*(*vi++);
167315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1674f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1675f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1676f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1677f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1678f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
167915091d37SBarry Smith       v    += 25;
168015091d37SBarry Smith     }
168115091d37SBarry Smith     v        = aa + 25*diag[i];
1682f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1683f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1684f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1685f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1686f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
168715091d37SBarry Smith   }
168815091d37SBarry Smith 
16891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
16901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1691b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
169215091d37SBarry Smith   PetscFunctionReturn(0);
169315091d37SBarry Smith }
169415091d37SBarry Smith 
16954a2ae208SSatish Balay #undef __FUNCT__
16964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1697dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16984e2b4712SSatish Balay {
16994e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
17004e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
1701*6849ba73SBarry Smith   PetscErrorCode ierr;
1702*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
17034e2b4712SSatish Balay   int             *diag = a->diag;
17043f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
170587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
17064e2b4712SSatish Balay 
17074e2b4712SSatish Balay   PetscFunctionBegin;
17081ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
17091ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1710f1af5d2fSBarry Smith   t  = a->solve_work;
17114e2b4712SSatish Balay 
17124e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17134e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17144e2b4712SSatish Balay 
17154e2b4712SSatish Balay   /* forward solve the lower triangular */
17164e2b4712SSatish Balay   idx    = 4*(*r++);
1717f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1718f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
17194e2b4712SSatish Balay   for (i=1; i<n; i++) {
17204e2b4712SSatish Balay     v     = aa + 16*ai[i];
17214e2b4712SSatish Balay     vi    = aj + ai[i];
17224e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17234e2b4712SSatish Balay     idx   = 4*(*r++);
1724f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17254e2b4712SSatish Balay     while (nz--) {
17264e2b4712SSatish Balay       idx   = 4*(*vi++);
1727f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1728f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1729f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1730f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1731f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17324e2b4712SSatish Balay       v    += 16;
17334e2b4712SSatish Balay     }
17344e2b4712SSatish Balay     idx        = 4*i;
1735f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1736f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17374e2b4712SSatish Balay   }
17384e2b4712SSatish Balay   /* backward solve the upper triangular */
17394e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17404e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17414e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17424e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17434e2b4712SSatish Balay     idt  = 4*i;
1744f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1745f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17464e2b4712SSatish Balay     while (nz--) {
17474e2b4712SSatish Balay       idx   = 4*(*vi++);
1748f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1749f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1750f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1751f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1752f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1753f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17544e2b4712SSatish Balay       v += 16;
17554e2b4712SSatish Balay     }
17564e2b4712SSatish Balay     idc      = 4*(*c--);
17574e2b4712SSatish Balay     v        = aa + 16*diag[i];
1758f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1759f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1760f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1761f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17624e2b4712SSatish Balay   }
17634e2b4712SSatish Balay 
17644e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17654e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17661ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
17671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1768b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17694e2b4712SSatish Balay   PetscFunctionReturn(0);
17704e2b4712SSatish Balay }
1771f26ec98cSKris Buschelman 
1772f26ec98cSKris Buschelman #undef __FUNCT__
1773f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1774dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1775f26ec98cSKris Buschelman {
1776f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1777f26ec98cSKris Buschelman   IS              iscol=a->col,isrow=a->row;
1778*6849ba73SBarry Smith   PetscErrorCode ierr;
1779*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1780f26ec98cSKris Buschelman   int             *diag = a->diag;
1781f26ec98cSKris Buschelman   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1782f26ec98cSKris Buschelman   PetscScalar     *x,*b;
1783f26ec98cSKris Buschelman 
1784f26ec98cSKris Buschelman   PetscFunctionBegin;
17851ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
17861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1787f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1788f26ec98cSKris Buschelman 
1789f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1790f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1791f26ec98cSKris Buschelman 
1792f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1793f26ec98cSKris Buschelman   idx    = 4*(*r++);
1794f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1795f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1796f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1797f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1798f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1799f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1800f26ec98cSKris Buschelman     vi    = aj + ai[i];
1801f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1802f26ec98cSKris Buschelman     idx   = 4*(*r++);
1803f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1804f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1805f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1806f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1807f26ec98cSKris Buschelman     while (nz--) {
1808f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1809f26ec98cSKris Buschelman       x1  = t[idx];
1810f26ec98cSKris Buschelman       x2  = t[1+idx];
1811f26ec98cSKris Buschelman       x3  = t[2+idx];
1812f26ec98cSKris Buschelman       x4  = t[3+idx];
1813f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1814f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1815f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1817f26ec98cSKris Buschelman       v    += 16;
1818f26ec98cSKris Buschelman     }
1819f26ec98cSKris Buschelman     idx        = 4*i;
1820f26ec98cSKris Buschelman     t[idx]   = s1;
1821f26ec98cSKris Buschelman     t[1+idx] = s2;
1822f26ec98cSKris Buschelman     t[2+idx] = s3;
1823f26ec98cSKris Buschelman     t[3+idx] = s4;
1824f26ec98cSKris Buschelman   }
1825f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1826f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1827f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1828f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1829f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1830f26ec98cSKris Buschelman     idt  = 4*i;
1831f26ec98cSKris Buschelman     s1 = t[idt];
1832f26ec98cSKris Buschelman     s2 = t[1+idt];
1833f26ec98cSKris Buschelman     s3 = t[2+idt];
1834f26ec98cSKris Buschelman     s4 = t[3+idt];
1835f26ec98cSKris Buschelman     while (nz--) {
1836f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1837f26ec98cSKris Buschelman       x1  = t[idx];
1838f26ec98cSKris Buschelman       x2  = t[1+idx];
1839f26ec98cSKris Buschelman       x3  = t[2+idx];
1840f26ec98cSKris Buschelman       x4  = t[3+idx];
1841f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1842f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1843f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1844f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1845f26ec98cSKris Buschelman       v += 16;
1846f26ec98cSKris Buschelman     }
1847f26ec98cSKris Buschelman     idc      = 4*(*c--);
1848f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1849f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1850f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1851f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1852f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1853f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1854f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1855f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1856f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1857f26ec98cSKris Buschelman  }
1858f26ec98cSKris Buschelman 
1859f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1860f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18611ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
18621ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1863f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1864f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1865f26ec98cSKris Buschelman }
1866f26ec98cSKris Buschelman 
186724c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
186824c233c2SKris Buschelman 
186924c233c2SKris Buschelman #include PETSC_HAVE_SSE
187024c233c2SKris Buschelman 
187124c233c2SKris Buschelman #undef __FUNCT__
187224c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1873dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
187424c233c2SKris Buschelman {
187524c233c2SKris Buschelman   /*
187624c233c2SKris Buschelman      Note: This code uses demotion of double
187724c233c2SKris Buschelman      to float when performing the mixed-mode computation.
187824c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
187924c233c2SKris Buschelman   */
188024c233c2SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
188124c233c2SKris Buschelman   IS              iscol=a->col,isrow=a->row;
1882*6849ba73SBarry Smith   PetscErrorCode ierr;
1883*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
188424c233c2SKris Buschelman   int             *diag = a->diag,ai16;
188524c233c2SKris Buschelman   MatScalar       *aa=a->a,*v;
188687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
188724c233c2SKris Buschelman 
188824c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
188924c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
189024c233c2SKris Buschelman   unsigned long   offset;
189124c233c2SKris Buschelman 
189224c233c2SKris Buschelman   PetscFunctionBegin;
189324c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
189424c233c2SKris Buschelman 
189524c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
189624c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
189724c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
189824c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
189924c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
190024c233c2SKris Buschelman 
19011ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
19021ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
190324c233c2SKris Buschelman     t  = a->solve_work;
190424c233c2SKris Buschelman 
190524c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
190624c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
190724c233c2SKris Buschelman 
190824c233c2SKris Buschelman     /* forward solve the lower triangular */
190924c233c2SKris Buschelman     idx  = 4*(*r++);
191024c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
191124c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
191224c233c2SKris Buschelman     v    =  aa + 16*ai[1];
191324c233c2SKris Buschelman 
191424c233c2SKris Buschelman     for (i=1; i<n;) {
191524c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
191624c233c2SKris Buschelman       vi   =  aj      + ai[i];
191724c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
191824c233c2SKris Buschelman       idx  =  4*(*r++);
191924c233c2SKris Buschelman 
192024c233c2SKris Buschelman       /* Demote sum from double to float */
192124c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
192224c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
192324c233c2SKris Buschelman 
192424c233c2SKris Buschelman       while (nz--) {
192524c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
192624c233c2SKris Buschelman         idx = 4*(*vi++);
192724c233c2SKris Buschelman 
192824c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
192924c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
193024c233c2SKris Buschelman 
193124c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
193224c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
193324c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
193424c233c2SKris Buschelman 
193524c233c2SKris Buschelman           /* First Column */
193624c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
193724c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
193824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
193924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
194024c233c2SKris Buschelman 
194124c233c2SKris Buschelman           /* Second Column */
194224c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
194324c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
194424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
194524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
194624c233c2SKris Buschelman 
194724c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
194824c233c2SKris Buschelman 
194924c233c2SKris Buschelman           /* Third Column */
195024c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
195124c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
195224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
195324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
195424c233c2SKris Buschelman 
195524c233c2SKris Buschelman           /* Fourth Column */
195624c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
195724c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
195824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
195924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
196024c233c2SKris Buschelman         SSE_INLINE_END_2
196124c233c2SKris Buschelman 
196224c233c2SKris Buschelman         v  += 16;
196324c233c2SKris Buschelman       }
196424c233c2SKris Buschelman       idx = 4*i;
196524c233c2SKris Buschelman       v   = aa + 16*ai[++i];
196624c233c2SKris Buschelman       PREFETCH_NTA(v);
196724c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
196824c233c2SKris Buschelman 
196924c233c2SKris Buschelman       /* Promote result from float to double */
197024c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
197124c233c2SKris Buschelman     }
197224c233c2SKris Buschelman     /* backward solve the upper triangular */
197324c233c2SKris Buschelman     idt  = 4*(n-1);
197424c233c2SKris Buschelman     ai16 = 16*diag[n-1];
197524c233c2SKris Buschelman     v    = aa + ai16 + 16;
197624c233c2SKris Buschelman     for (i=n-1; i>=0;){
197724c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
197824c233c2SKris Buschelman       vi = aj + diag[i] + 1;
197924c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
198024c233c2SKris Buschelman 
198124c233c2SKris Buschelman       /* Demote accumulator from double to float */
198224c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
198324c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
198424c233c2SKris Buschelman 
198524c233c2SKris Buschelman       while (nz--) {
198624c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
198724c233c2SKris Buschelman         idx = 4*(*vi++);
198824c233c2SKris Buschelman 
198924c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
199024c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
199124c233c2SKris Buschelman 
199224c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
199324c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
199424c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
199524c233c2SKris Buschelman 
199624c233c2SKris Buschelman           /* First Column */
199724c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
199824c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
199924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
200024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
200124c233c2SKris Buschelman 
200224c233c2SKris Buschelman           /* Second Column */
200324c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
200424c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
200524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
200624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
200724c233c2SKris Buschelman 
200824c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
200924c233c2SKris Buschelman 
201024c233c2SKris Buschelman           /* Third Column */
201124c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
201224c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
201324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
201424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
201524c233c2SKris Buschelman 
201624c233c2SKris Buschelman           /* Fourth Column */
201724c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
201824c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
201924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
202024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
202124c233c2SKris Buschelman         SSE_INLINE_END_2
202224c233c2SKris Buschelman         v  += 16;
202324c233c2SKris Buschelman       }
202424c233c2SKris Buschelman       v    = aa + ai16;
202524c233c2SKris Buschelman       ai16 = 16*diag[--i];
202624c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
202724c233c2SKris Buschelman       /*
202824c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
202924c233c2SKris Buschelman          which was inverted as part of the factorization
203024c233c2SKris Buschelman       */
203124c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
203224c233c2SKris Buschelman         /* First Column */
203324c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
203424c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
203524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
203624c233c2SKris Buschelman 
203724c233c2SKris Buschelman         /* Second Column */
203824c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
203924c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
204024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
204124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
204224c233c2SKris Buschelman 
204324c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
204424c233c2SKris Buschelman 
204524c233c2SKris Buschelman         /* Third Column */
204624c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
204724c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
204824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
204924c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
205024c233c2SKris Buschelman 
205124c233c2SKris Buschelman         /* Fourth Column */
205224c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
205324c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
205424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
205524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
205624c233c2SKris Buschelman 
205724c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
205824c233c2SKris Buschelman       SSE_INLINE_END_3
205924c233c2SKris Buschelman 
206024c233c2SKris Buschelman       /* Promote solution from float to double */
206124c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
206224c233c2SKris Buschelman 
206324c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
206424c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
206524c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
206624c233c2SKris Buschelman       idc  = 4*(*c--);
206724c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
206824c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
206924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
207024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
207124c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
207224c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
207324c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
207424c233c2SKris Buschelman       SSE_INLINE_END_2
207524c233c2SKris Buschelman       v    = aa + ai16 + 16;
207624c233c2SKris Buschelman       idt -= 4;
207724c233c2SKris Buschelman     }
207824c233c2SKris Buschelman 
207924c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
208024c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20811ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
20821ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
208324c233c2SKris Buschelman     PetscLogFlops(2*16*(a->nz) - 4*A->n);
208424c233c2SKris Buschelman   SSE_SCOPE_END;
208524c233c2SKris Buschelman   PetscFunctionReturn(0);
208624c233c2SKris Buschelman }
208724c233c2SKris Buschelman 
208824c233c2SKris Buschelman #endif
20890ef38995SBarry Smith 
20900ef38995SBarry Smith 
20914e2b4712SSatish Balay /*
20924e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
20934e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
20944e2b4712SSatish Balay */
20954a2ae208SSatish Balay #undef __FUNCT__
20964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2097dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
20984e2b4712SSatish Balay {
20994e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
210030d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
2101dfbe8321SBarry Smith   PetscErrorCode ierr;
2102dfbe8321SBarry Smith   int  *diag = a->diag;
21033f1db9ecSBarry Smith   MatScalar       *aa=a->a;
210487828ca2SBarry Smith   PetscScalar     *x,*b;
21054e2b4712SSatish Balay 
21064e2b4712SSatish Balay   PetscFunctionBegin;
21071ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
21081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21094e2b4712SSatish Balay 
2110aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
21112853dc0eSBarry Smith   {
211287828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21132853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
21142853dc0eSBarry Smith   }
2115aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
21162853dc0eSBarry Smith   {
211787828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21182853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
21192853dc0eSBarry Smith   }
2120aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
21212853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2122e1293385SBarry Smith #else
212330d4dcafSBarry Smith   {
212487828ca2SBarry Smith     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
21253f1db9ecSBarry Smith     MatScalar    *v;
21264e555682SBarry Smith     int          jdx,idt,idx,nz,*vi,i,ai16;
2127e1293385SBarry Smith 
21284e2b4712SSatish Balay   /* forward solve the lower triangular */
21294e2b4712SSatish Balay   idx    = 0;
2130e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21314e2b4712SSatish Balay   for (i=1; i<n; i++) {
21324e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21334e2b4712SSatish Balay     vi    =  aj      + ai[i];
21344e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2135e1293385SBarry Smith     idx   +=  4;
2136f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21374e2b4712SSatish Balay     while (nz--) {
21384e2b4712SSatish Balay       jdx   = 4*(*vi++);
21394e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2140f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2141f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2142f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2143f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21444e2b4712SSatish Balay       v    += 16;
21454e2b4712SSatish Balay     }
2146f1af5d2fSBarry Smith     x[idx]   = s1;
2147f1af5d2fSBarry Smith     x[1+idx] = s2;
2148f1af5d2fSBarry Smith     x[2+idx] = s3;
2149f1af5d2fSBarry Smith     x[3+idx] = s4;
21504e2b4712SSatish Balay   }
21514e2b4712SSatish Balay   /* backward solve the upper triangular */
21524e555682SBarry Smith   idt = 4*(n-1);
21534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21544e555682SBarry Smith     ai16 = 16*diag[i];
21554e555682SBarry Smith     v    = aa + ai16 + 16;
21564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2158f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2159f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21604e2b4712SSatish Balay     while (nz--) {
21614e2b4712SSatish Balay       idx   = 4*(*vi++);
21624e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2163f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2164f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2165f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2166f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21674e2b4712SSatish Balay       v    += 16;
21684e2b4712SSatish Balay     }
21694e555682SBarry Smith     v        = aa + ai16;
2170f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2171f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2172f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2173f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2174329f5518SBarry Smith     idt -= 4;
21754e2b4712SSatish Balay   }
217630d4dcafSBarry Smith   }
2177e1293385SBarry Smith #endif
21784e2b4712SSatish Balay 
21791ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
21801ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2181b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
21824e2b4712SSatish Balay   PetscFunctionReturn(0);
21834e2b4712SSatish Balay }
21844e2b4712SSatish Balay 
2185f26ec98cSKris Buschelman #undef __FUNCT__
2186f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2187dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2188f26ec98cSKris Buschelman {
2189f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2190f26ec98cSKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
2191dfbe8321SBarry Smith   PetscErrorCode ierr;
2192dfbe8321SBarry Smith   int  *diag = a->diag;
2193f26ec98cSKris Buschelman   MatScalar       *aa=a->a;
2194f26ec98cSKris Buschelman   PetscScalar     *x,*b;
2195f26ec98cSKris Buschelman 
2196f26ec98cSKris Buschelman   PetscFunctionBegin;
21971ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
21981ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2199f26ec98cSKris Buschelman 
2200f26ec98cSKris Buschelman   {
2201f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2202f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2203f26ec98cSKris Buschelman     int        jdx,idt,idx,nz,*vi,i,ai16;
2204f26ec98cSKris Buschelman 
2205f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2206f26ec98cSKris Buschelman     idx  = 0;
2207f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2208f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2209f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2210f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2211f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2212f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2213f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2214f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2215f26ec98cSKris Buschelman       idx   +=  4;
2216f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2217f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2218f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2219f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2220f26ec98cSKris Buschelman       while (nz--) {
2221f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2222f26ec98cSKris Buschelman         x1  = t[jdx];
2223f26ec98cSKris Buschelman         x2  = t[1+jdx];
2224f26ec98cSKris Buschelman         x3  = t[2+jdx];
2225f26ec98cSKris Buschelman         x4  = t[3+jdx];
2226f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2227f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2228f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2229f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2230f26ec98cSKris Buschelman         v    += 16;
2231f26ec98cSKris Buschelman       }
2232f26ec98cSKris Buschelman       t[idx]   = s1;
2233f26ec98cSKris Buschelman       t[1+idx] = s2;
2234f26ec98cSKris Buschelman       t[2+idx] = s3;
2235f26ec98cSKris Buschelman       t[3+idx] = s4;
2236f26ec98cSKris Buschelman     }
2237f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2238f26ec98cSKris Buschelman     idt = 4*(n-1);
2239f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2240f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2241f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2242f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2243f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2244f26ec98cSKris Buschelman       s1   = t[idt];
2245f26ec98cSKris Buschelman       s2   = t[1+idt];
2246f26ec98cSKris Buschelman       s3   = t[2+idt];
2247f26ec98cSKris Buschelman       s4   = t[3+idt];
2248f26ec98cSKris Buschelman       while (nz--) {
2249f26ec98cSKris Buschelman         idx = 4*(*vi++);
2250f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2251f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2252f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2253f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2254f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2255f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2256f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2257f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2258f26ec98cSKris Buschelman         v    += 16;
2259f26ec98cSKris Buschelman       }
2260f26ec98cSKris Buschelman       v        = aa + ai16;
2261f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2262f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2263f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2264f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2265f26ec98cSKris Buschelman       idt -= 4;
2266f26ec98cSKris Buschelman     }
2267f26ec98cSKris Buschelman   }
2268f26ec98cSKris Buschelman 
22691ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2271f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2272f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2273f26ec98cSKris Buschelman }
2274f26ec98cSKris Buschelman 
22753660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22763660e330SKris Buschelman 
22773660e330SKris Buschelman #include PETSC_HAVE_SSE
22783660e330SKris Buschelman #undef __FUNCT__
22797cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2280dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
22813660e330SKris Buschelman {
22823660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
22832aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2284dfbe8321SBarry Smith   PetscErrorCode ierr;
2285dfbe8321SBarry Smith   int  *ai=a->i,n=a->mbs,*diag = a->diag;
22863660e330SKris Buschelman   MatScalar      *aa=a->a;
228787828ca2SBarry Smith   PetscScalar    *x,*b;
22883660e330SKris Buschelman 
22893660e330SKris Buschelman   PetscFunctionBegin;
22903660e330SKris Buschelman   SSE_SCOPE_BEGIN;
22913660e330SKris Buschelman   /*
22923660e330SKris Buschelman      Note: This code currently uses demotion of double
22933660e330SKris Buschelman      to float when performing the mixed-mode computation.
22943660e330SKris Buschelman      This may not be numerically reasonable for all applications.
22953660e330SKris Buschelman   */
22963660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
22973660e330SKris Buschelman 
22981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23003660e330SKris Buschelman   {
2301eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2302eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
23032aa5897fSKris Buschelman     int            nz,i,idt,ai16;
23042aa5897fSKris Buschelman     unsigned int   jdx,idx;
23052aa5897fSKris Buschelman     unsigned short *vi;
2306eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
23073660e330SKris Buschelman 
2308eb05f457SKris Buschelman     /* First block is the identity. */
23093660e330SKris Buschelman     idx  = 0;
2310eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
23112aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
23123660e330SKris Buschelman 
23133660e330SKris Buschelman     for (i=1; i<n;) {
23143660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23153660e330SKris Buschelman       vi   =  aj      + ai[i];
23163660e330SKris Buschelman       nz   =  diag[i] - ai[i];
23173660e330SKris Buschelman       idx +=  4;
23183660e330SKris Buschelman 
2319eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2320eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2321eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
23223660e330SKris Buschelman 
23233660e330SKris Buschelman       while (nz--) {
23243660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23252aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
23263660e330SKris Buschelman 
23273660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2328eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
23293660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23303660e330SKris Buschelman 
23313660e330SKris Buschelman           /* First Column */
23323660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23333660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23343660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23353660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23363660e330SKris Buschelman 
23373660e330SKris Buschelman           /* Second Column */
23383660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23393660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23403660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23413660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23423660e330SKris Buschelman 
23433660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23443660e330SKris Buschelman 
23453660e330SKris Buschelman           /* Third Column */
23463660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23473660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23483660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23493660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23503660e330SKris Buschelman 
23513660e330SKris Buschelman           /* Fourth Column */
23523660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23533660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23543660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23553660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23563660e330SKris Buschelman         SSE_INLINE_END_2
23573660e330SKris Buschelman 
23583660e330SKris Buschelman         v  += 16;
23593660e330SKris Buschelman       }
23603660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23613660e330SKris Buschelman       PREFETCH_NTA(v);
2362eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
23633660e330SKris Buschelman     }
2364eb05f457SKris Buschelman 
2365eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2366eb05f457SKris Buschelman 
23673660e330SKris Buschelman     idt  = 4*(n-1);
23683660e330SKris Buschelman     ai16 = 16*diag[n-1];
23693660e330SKris Buschelman     v    = aa + ai16 + 16;
23703660e330SKris Buschelman     for (i=n-1; i>=0;){
23713660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23723660e330SKris Buschelman       vi = aj + diag[i] + 1;
23733660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23743660e330SKris Buschelman 
2375eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
23763660e330SKris Buschelman 
23773660e330SKris Buschelman       while (nz--) {
23783660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23792aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
23803660e330SKris Buschelman 
23813660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2382eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
23833660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23843660e330SKris Buschelman 
23853660e330SKris Buschelman           /* First Column */
23863660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23873660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23883660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23893660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23903660e330SKris Buschelman 
23913660e330SKris Buschelman           /* Second Column */
23923660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23933660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23943660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23953660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23963660e330SKris Buschelman 
23973660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23983660e330SKris Buschelman 
23993660e330SKris Buschelman           /* Third Column */
24003660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24013660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24023660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24033660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24043660e330SKris Buschelman 
24053660e330SKris Buschelman           /* Fourth Column */
24063660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24073660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24083660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24093660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
24103660e330SKris Buschelman         SSE_INLINE_END_2
24113660e330SKris Buschelman         v  += 16;
24123660e330SKris Buschelman       }
24133660e330SKris Buschelman       v    = aa + ai16;
24143660e330SKris Buschelman       ai16 = 16*diag[--i];
24153660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
24163660e330SKris Buschelman       /*
24173660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
24183660e330SKris Buschelman          which was inverted as part of the factorization
24193660e330SKris Buschelman       */
2420eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
24213660e330SKris Buschelman         /* First Column */
24223660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
24233660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
24243660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
24253660e330SKris Buschelman 
24263660e330SKris Buschelman         /* Second Column */
24273660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24283660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24293660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24303660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24313660e330SKris Buschelman 
24323660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24333660e330SKris Buschelman 
24343660e330SKris Buschelman         /* Third Column */
24353660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24363660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24373660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24383660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24393660e330SKris Buschelman 
24403660e330SKris Buschelman         /* Fourth Column */
24413660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24423660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24433660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24443660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24453660e330SKris Buschelman 
24463660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24473660e330SKris Buschelman       SSE_INLINE_END_3
24483660e330SKris Buschelman 
24493660e330SKris Buschelman       v    = aa + ai16 + 16;
24503660e330SKris Buschelman       idt -= 4;
24513660e330SKris Buschelman     }
2452eb05f457SKris Buschelman 
2453eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2454eb05f457SKris Buschelman     idt = 4*(n-1);
2455eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2456eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2457eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2458eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2459eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2460eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2461eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2462eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2463eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
246454693613SKris Buschelman       idt -= 4;
24653660e330SKris Buschelman     }
2466eb05f457SKris Buschelman 
2467eb05f457SKris Buschelman   } /* End of artificial scope. */
24681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
24691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24703660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
24713660e330SKris Buschelman   SSE_SCOPE_END;
24723660e330SKris Buschelman   PetscFunctionReturn(0);
24733660e330SKris Buschelman }
24743660e330SKris Buschelman 
24757cf1b8d3SKris Buschelman #undef __FUNCT__
24767cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2477dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
24787cf1b8d3SKris Buschelman {
24797cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
24807cf1b8d3SKris Buschelman   int            *aj=a->j;
2481dfbe8321SBarry Smith   PetscErrorCode ierr;
2482dfbe8321SBarry Smith   int *ai=a->i,n=a->mbs,*diag = a->diag;
24837cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
24847cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
24857cf1b8d3SKris Buschelman 
24867cf1b8d3SKris Buschelman   PetscFunctionBegin;
24877cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
24887cf1b8d3SKris Buschelman   /*
24897cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
24907cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
24917cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
24927cf1b8d3SKris Buschelman   */
24937cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
24947cf1b8d3SKris Buschelman 
24951ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24977cf1b8d3SKris Buschelman   {
24987cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
24997cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
25007cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
25017cf1b8d3SKris Buschelman     int       jdx,idx;
25027cf1b8d3SKris Buschelman     int       *vi;
25037cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
25047cf1b8d3SKris Buschelman 
25057cf1b8d3SKris Buschelman     /* First block is the identity. */
25067cf1b8d3SKris Buschelman     idx  = 0;
25077cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
25087cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
25097cf1b8d3SKris Buschelman 
25107cf1b8d3SKris Buschelman     for (i=1; i<n;) {
25117cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25127cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
25137cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
25147cf1b8d3SKris Buschelman       idx +=  4;
25157cf1b8d3SKris Buschelman 
25167cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
25177cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
25187cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
25197cf1b8d3SKris Buschelman 
25207cf1b8d3SKris Buschelman       while (nz--) {
25217cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25227cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
25237cf1b8d3SKris Buschelman /*          jdx = *vi++; */
25247cf1b8d3SKris Buschelman 
25257cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
25267cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
25277cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25287cf1b8d3SKris Buschelman 
25297cf1b8d3SKris Buschelman           /* First Column */
25307cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25317cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25327cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25337cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25347cf1b8d3SKris Buschelman 
25357cf1b8d3SKris Buschelman           /* Second Column */
25367cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25377cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25387cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25397cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25407cf1b8d3SKris Buschelman 
25417cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25427cf1b8d3SKris Buschelman 
25437cf1b8d3SKris Buschelman           /* Third Column */
25447cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25457cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25467cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25477cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25487cf1b8d3SKris Buschelman 
25497cf1b8d3SKris Buschelman           /* Fourth Column */
25507cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25517cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25527cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25537cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25547cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25557cf1b8d3SKris Buschelman 
25567cf1b8d3SKris Buschelman         v  += 16;
25577cf1b8d3SKris Buschelman       }
25587cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
25597cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
25607cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
25617cf1b8d3SKris Buschelman     }
25627cf1b8d3SKris Buschelman 
25637cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
25647cf1b8d3SKris Buschelman 
25657cf1b8d3SKris Buschelman     idt  = 4*(n-1);
25667cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
25677cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
25687cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
25697cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25707cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
25717cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25727cf1b8d3SKris Buschelman 
25737cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25747cf1b8d3SKris Buschelman 
25757cf1b8d3SKris Buschelman       while (nz--) {
25767cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25777cf1b8d3SKris Buschelman         idx = 4*(*vi++);
25787cf1b8d3SKris Buschelman /*          idx = *vi++; */
25797cf1b8d3SKris Buschelman 
25807cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
25817cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
25827cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25837cf1b8d3SKris Buschelman 
25847cf1b8d3SKris Buschelman           /* First Column */
25857cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25867cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25877cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25887cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25897cf1b8d3SKris Buschelman 
25907cf1b8d3SKris Buschelman           /* Second Column */
25917cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25927cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25937cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25947cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25957cf1b8d3SKris Buschelman 
25967cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25977cf1b8d3SKris Buschelman 
25987cf1b8d3SKris Buschelman           /* Third Column */
25997cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26007cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26017cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26027cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26037cf1b8d3SKris Buschelman 
26047cf1b8d3SKris Buschelman           /* Fourth Column */
26057cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26067cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26077cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26087cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26097cf1b8d3SKris Buschelman         SSE_INLINE_END_2
26107cf1b8d3SKris Buschelman         v  += 16;
26117cf1b8d3SKris Buschelman       }
26127cf1b8d3SKris Buschelman       v    = aa + ai16;
26137cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
26147cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
26157cf1b8d3SKris Buschelman       /*
26167cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
26177cf1b8d3SKris Buschelman          which was inverted as part of the factorization
26187cf1b8d3SKris Buschelman       */
26197cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
26207cf1b8d3SKris Buschelman         /* First Column */
26217cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
26227cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
26237cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
26247cf1b8d3SKris Buschelman 
26257cf1b8d3SKris Buschelman         /* Second Column */
26267cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
26277cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
26287cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
26297cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
26307cf1b8d3SKris Buschelman 
26317cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
26327cf1b8d3SKris Buschelman 
26337cf1b8d3SKris Buschelman         /* Third Column */
26347cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
26357cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
26367cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
26377cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
26387cf1b8d3SKris Buschelman 
26397cf1b8d3SKris Buschelman         /* Fourth Column */
26407cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
26417cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
26427cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
26437cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
26447cf1b8d3SKris Buschelman 
26457cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
26467cf1b8d3SKris Buschelman       SSE_INLINE_END_3
26477cf1b8d3SKris Buschelman 
26487cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
26497cf1b8d3SKris Buschelman       idt -= 4;
26507cf1b8d3SKris Buschelman     }
26517cf1b8d3SKris Buschelman 
26527cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
26537cf1b8d3SKris Buschelman     idt = 4*(n-1);
26547cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
26557cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
26567cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
26577cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
26587cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
26597cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
26607cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
26617cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
26627cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
26637cf1b8d3SKris Buschelman       idt -= 4;
26647cf1b8d3SKris Buschelman     }
26657cf1b8d3SKris Buschelman 
26667cf1b8d3SKris Buschelman   } /* End of artificial scope. */
26671ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
26697cf1b8d3SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
26707cf1b8d3SKris Buschelman   SSE_SCOPE_END;
26717cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
26727cf1b8d3SKris Buschelman }
26737cf1b8d3SKris Buschelman 
26743660e330SKris Buschelman #endif
26754a2ae208SSatish Balay #undef __FUNCT__
26764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2677dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
26784e2b4712SSatish Balay {
26794e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
26804e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
2681*6849ba73SBarry Smith   PetscErrorCode ierr;
2682*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
26834e2b4712SSatish Balay   int             *diag = a->diag;
26843f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
268587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
26864e2b4712SSatish Balay 
26874e2b4712SSatish Balay   PetscFunctionBegin;
26881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2690f1af5d2fSBarry Smith   t  = a->solve_work;
26914e2b4712SSatish Balay 
26924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26944e2b4712SSatish Balay 
26954e2b4712SSatish Balay   /* forward solve the lower triangular */
26964e2b4712SSatish Balay   idx    = 3*(*r++);
2697f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
26984e2b4712SSatish Balay   for (i=1; i<n; i++) {
26994e2b4712SSatish Balay     v     = aa + 9*ai[i];
27004e2b4712SSatish Balay     vi    = aj + ai[i];
27014e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27024e2b4712SSatish Balay     idx   = 3*(*r++);
2703f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
27044e2b4712SSatish Balay     while (nz--) {
27054e2b4712SSatish Balay       idx   = 3*(*vi++);
2706f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2707f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2708f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2709f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27104e2b4712SSatish Balay       v += 9;
27114e2b4712SSatish Balay     }
27124e2b4712SSatish Balay     idx = 3*i;
2713f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
27144e2b4712SSatish Balay   }
27154e2b4712SSatish Balay   /* backward solve the upper triangular */
27164e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27174e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
27184e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27194e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
27204e2b4712SSatish Balay     idt  = 3*i;
2721f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
27224e2b4712SSatish Balay     while (nz--) {
27234e2b4712SSatish Balay       idx   = 3*(*vi++);
2724f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2725f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2726f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2727f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27284e2b4712SSatish Balay       v += 9;
27294e2b4712SSatish Balay     }
27304e2b4712SSatish Balay     idc = 3*(*c--);
27314e2b4712SSatish Balay     v   = aa + 9*diag[i];
2732f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2733f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2734f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
27354e2b4712SSatish Balay   }
27364e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27374e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
27391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2740b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
27414e2b4712SSatish Balay   PetscFunctionReturn(0);
27424e2b4712SSatish Balay }
27434e2b4712SSatish Balay 
274415091d37SBarry Smith /*
274515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
274615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
274715091d37SBarry Smith */
27484a2ae208SSatish Balay #undef __FUNCT__
27494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2750dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
275115091d37SBarry Smith {
275215091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
275315091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
2754dfbe8321SBarry Smith   PetscErrorCode ierr;
2755dfbe8321SBarry Smith   int *diag = a->diag;
275615091d37SBarry Smith   MatScalar       *aa=a->a,*v;
275787828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
275815091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
275915091d37SBarry Smith 
276015091d37SBarry Smith   PetscFunctionBegin;
27611ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
27621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
276315091d37SBarry Smith 
276415091d37SBarry Smith 
276515091d37SBarry Smith   /* forward solve the lower triangular */
276615091d37SBarry Smith   idx    = 0;
276715091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
276815091d37SBarry Smith   for (i=1; i<n; i++) {
276915091d37SBarry Smith     v     =  aa      + 9*ai[i];
277015091d37SBarry Smith     vi    =  aj      + ai[i];
277115091d37SBarry Smith     nz    =  diag[i] - ai[i];
277215091d37SBarry Smith     idx   +=  3;
2773f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
277415091d37SBarry Smith     while (nz--) {
277515091d37SBarry Smith       jdx   = 3*(*vi++);
277615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2777f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2778f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2779f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
278015091d37SBarry Smith       v    += 9;
278115091d37SBarry Smith     }
2782f1af5d2fSBarry Smith     x[idx]   = s1;
2783f1af5d2fSBarry Smith     x[1+idx] = s2;
2784f1af5d2fSBarry Smith     x[2+idx] = s3;
278515091d37SBarry Smith   }
278615091d37SBarry Smith   /* backward solve the upper triangular */
278715091d37SBarry Smith   for (i=n-1; i>=0; i--){
278815091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
278915091d37SBarry Smith     vi   = aj + diag[i] + 1;
279015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
279115091d37SBarry Smith     idt  = 3*i;
2792f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2793f1af5d2fSBarry Smith     s3 = x[2+idt];
279415091d37SBarry Smith     while (nz--) {
279515091d37SBarry Smith       idx   = 3*(*vi++);
279615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2797f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2798f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2799f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
280015091d37SBarry Smith       v    += 9;
280115091d37SBarry Smith     }
280215091d37SBarry Smith     v        = aa +  9*diag[i];
2803f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2804f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2805f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
280615091d37SBarry Smith   }
280715091d37SBarry Smith 
28081ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28091ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2810b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
281115091d37SBarry Smith   PetscFunctionReturn(0);
281215091d37SBarry Smith }
281315091d37SBarry Smith 
28144a2ae208SSatish Balay #undef __FUNCT__
28154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2816dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
28174e2b4712SSatish Balay {
28184e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
28194e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
2820*6849ba73SBarry Smith   PetscErrorCode ierr;
2821*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
28224e2b4712SSatish Balay   int             *diag = a->diag;
28233f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
282487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
28254e2b4712SSatish Balay 
28264e2b4712SSatish Balay   PetscFunctionBegin;
28271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
28281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2829f1af5d2fSBarry Smith   t  = a->solve_work;
28304e2b4712SSatish Balay 
28314e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28324e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28334e2b4712SSatish Balay 
28344e2b4712SSatish Balay   /* forward solve the lower triangular */
28354e2b4712SSatish Balay   idx    = 2*(*r++);
2836f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
28374e2b4712SSatish Balay   for (i=1; i<n; i++) {
28384e2b4712SSatish Balay     v     = aa + 4*ai[i];
28394e2b4712SSatish Balay     vi    = aj + ai[i];
28404e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28414e2b4712SSatish Balay     idx   = 2*(*r++);
2842f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
28434e2b4712SSatish Balay     while (nz--) {
28444e2b4712SSatish Balay       idx   = 2*(*vi++);
2845f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2846f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2847f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28484e2b4712SSatish Balay       v += 4;
28494e2b4712SSatish Balay     }
28504e2b4712SSatish Balay     idx = 2*i;
2851f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
28524e2b4712SSatish Balay   }
28534e2b4712SSatish Balay   /* backward solve the upper triangular */
28544e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28554e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
28564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28584e2b4712SSatish Balay     idt  = 2*i;
2859f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
28604e2b4712SSatish Balay     while (nz--) {
28614e2b4712SSatish Balay       idx   = 2*(*vi++);
2862f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2863f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2864f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28654e2b4712SSatish Balay       v += 4;
28664e2b4712SSatish Balay     }
28674e2b4712SSatish Balay     idc = 2*(*c--);
28684e2b4712SSatish Balay     v   = aa + 4*diag[i];
2869f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2870f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
28714e2b4712SSatish Balay   }
28724e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28734e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
28741ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2876b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
28774e2b4712SSatish Balay   PetscFunctionReturn(0);
28784e2b4712SSatish Balay }
28794e2b4712SSatish Balay 
288015091d37SBarry Smith /*
288115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
288215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
288315091d37SBarry Smith */
28844a2ae208SSatish Balay #undef __FUNCT__
28854a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2886dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
288715091d37SBarry Smith {
288815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
288915091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
2890dfbe8321SBarry Smith   PetscErrorCode ierr;
2891dfbe8321SBarry Smith   int  *diag = a->diag;
289215091d37SBarry Smith   MatScalar       *aa=a->a,*v;
289387828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2;
289415091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
289515091d37SBarry Smith 
289615091d37SBarry Smith   PetscFunctionBegin;
28971ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
28981ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
289915091d37SBarry Smith 
290015091d37SBarry Smith   /* forward solve the lower triangular */
290115091d37SBarry Smith   idx    = 0;
290215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
290315091d37SBarry Smith   for (i=1; i<n; i++) {
290415091d37SBarry Smith     v     =  aa      + 4*ai[i];
290515091d37SBarry Smith     vi    =  aj      + ai[i];
290615091d37SBarry Smith     nz    =  diag[i] - ai[i];
290715091d37SBarry Smith     idx   +=  2;
2908f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
290915091d37SBarry Smith     while (nz--) {
291015091d37SBarry Smith       jdx   = 2*(*vi++);
291115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2912f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2913f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
291415091d37SBarry Smith       v    += 4;
291515091d37SBarry Smith     }
2916f1af5d2fSBarry Smith     x[idx]   = s1;
2917f1af5d2fSBarry Smith     x[1+idx] = s2;
291815091d37SBarry Smith   }
291915091d37SBarry Smith   /* backward solve the upper triangular */
292015091d37SBarry Smith   for (i=n-1; i>=0; i--){
292115091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
292215091d37SBarry Smith     vi   = aj + diag[i] + 1;
292315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
292415091d37SBarry Smith     idt  = 2*i;
2925f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
292615091d37SBarry Smith     while (nz--) {
292715091d37SBarry Smith       idx   = 2*(*vi++);
292815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2929f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2930f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
293115091d37SBarry Smith       v    += 4;
293215091d37SBarry Smith     }
293315091d37SBarry Smith     v        = aa +  4*diag[i];
2934f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2935f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
293615091d37SBarry Smith   }
293715091d37SBarry Smith 
29381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
29391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2940b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
294115091d37SBarry Smith   PetscFunctionReturn(0);
294215091d37SBarry Smith }
294315091d37SBarry Smith 
29444a2ae208SSatish Balay #undef __FUNCT__
29454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2946dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
29474e2b4712SSatish Balay {
29484e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
29494e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
2950*6849ba73SBarry Smith   PetscErrorCode ierr;
2951*6849ba73SBarry Smith   int             *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
29524e2b4712SSatish Balay   int             *diag = a->diag;
29533f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
295487828ca2SBarry Smith   PetscScalar     *x,*b,s1,*t;
29554e2b4712SSatish Balay 
29564e2b4712SSatish Balay   PetscFunctionBegin;
29574e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
29584e2b4712SSatish Balay 
29591ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
29601ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2961f1af5d2fSBarry Smith   t  = a->solve_work;
29624e2b4712SSatish Balay 
29634e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29644e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29654e2b4712SSatish Balay 
29664e2b4712SSatish Balay   /* forward solve the lower triangular */
2967f1af5d2fSBarry Smith   t[0] = b[*r++];
29684e2b4712SSatish Balay   for (i=1; i<n; i++) {
29694e2b4712SSatish Balay     v     = aa + ai[i];
29704e2b4712SSatish Balay     vi    = aj + ai[i];
29714e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2972f1af5d2fSBarry Smith     s1  = b[*r++];
29734e2b4712SSatish Balay     while (nz--) {
2974f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29754e2b4712SSatish Balay     }
2976f1af5d2fSBarry Smith     t[i] = s1;
29774e2b4712SSatish Balay   }
29784e2b4712SSatish Balay   /* backward solve the upper triangular */
29794e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29804e2b4712SSatish Balay     v    = aa + diag[i] + 1;
29814e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29824e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2983f1af5d2fSBarry Smith     s1 = t[i];
29844e2b4712SSatish Balay     while (nz--) {
2985f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29864e2b4712SSatish Balay     }
2987f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
29884e2b4712SSatish Balay   }
29894e2b4712SSatish Balay 
29904e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29914e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
29921ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
29931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2994b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
29954e2b4712SSatish Balay   PetscFunctionReturn(0);
29964e2b4712SSatish Balay }
299715091d37SBarry Smith /*
299815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
299915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
300015091d37SBarry Smith */
30014a2ae208SSatish Balay #undef __FUNCT__
30024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
300415091d37SBarry Smith {
300515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
300615091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
3007dfbe8321SBarry Smith   PetscErrorCode ierr;
3008dfbe8321SBarry Smith   int        *diag = a->diag;
300915091d37SBarry Smith   MatScalar       *aa=a->a;
301087828ca2SBarry Smith   PetscScalar     *x,*b;
301187828ca2SBarry Smith   PetscScalar     s1,x1;
301215091d37SBarry Smith   MatScalar       *v;
301315091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
301415091d37SBarry Smith 
301515091d37SBarry Smith   PetscFunctionBegin;
30161ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
301815091d37SBarry Smith 
301915091d37SBarry Smith   /* forward solve the lower triangular */
302015091d37SBarry Smith   idx    = 0;
302115091d37SBarry Smith   x[0]   = b[0];
302215091d37SBarry Smith   for (i=1; i<n; i++) {
302315091d37SBarry Smith     v     =  aa      + ai[i];
302415091d37SBarry Smith     vi    =  aj      + ai[i];
302515091d37SBarry Smith     nz    =  diag[i] - ai[i];
302615091d37SBarry Smith     idx   +=  1;
3027f1af5d2fSBarry Smith     s1  =  b[idx];
302815091d37SBarry Smith     while (nz--) {
302915091d37SBarry Smith       jdx   = *vi++;
303015091d37SBarry Smith       x1    = x[jdx];
3031f1af5d2fSBarry Smith       s1 -= v[0]*x1;
303215091d37SBarry Smith       v    += 1;
303315091d37SBarry Smith     }
3034f1af5d2fSBarry Smith     x[idx]   = s1;
303515091d37SBarry Smith   }
303615091d37SBarry Smith   /* backward solve the upper triangular */
303715091d37SBarry Smith   for (i=n-1; i>=0; i--){
303815091d37SBarry Smith     v    = aa + diag[i] + 1;
303915091d37SBarry Smith     vi   = aj + diag[i] + 1;
304015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
304115091d37SBarry Smith     idt  = i;
3042f1af5d2fSBarry Smith     s1 = x[idt];
304315091d37SBarry Smith     while (nz--) {
304415091d37SBarry Smith       idx   = *vi++;
304515091d37SBarry Smith       x1    = x[idx];
3046f1af5d2fSBarry Smith       s1 -= v[0]*x1;
304715091d37SBarry Smith       v    += 1;
304815091d37SBarry Smith     }
304915091d37SBarry Smith     v        = aa +  diag[i];
3050f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
305115091d37SBarry Smith   }
30521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3054b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
305515091d37SBarry Smith   PetscFunctionReturn(0);
305615091d37SBarry Smith }
30574e2b4712SSatish Balay 
30584e2b4712SSatish Balay /* ----------------------------------------------------------------*/
30594e2b4712SSatish Balay /*
30604e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
30614e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
30624e2b4712SSatish Balay    Not a good example of code reuse.
30634e2b4712SSatish Balay */
3064dfbe8321SBarry Smith EXTERN PetscErrorCode MatMissingDiagonal_SeqBAIJ(Mat);
3065435faa5fSBarry Smith 
30664a2ae208SSatish Balay #undef __FUNCT__
30674a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3068dfbe8321SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
30694e2b4712SSatish Balay {
30704e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
30714e2b4712SSatish Balay   IS          isicol;
3072*6849ba73SBarry Smith   PetscErrorCode ierr;
3073*6849ba73SBarry Smith   int         *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
30744e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3075eb150c5cSKris Buschelman   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3076435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
30774533b203SBarry Smith   PetscTruth  col_identity,row_identity;
3078329f5518SBarry Smith   PetscReal   f;
30794e2b4712SSatish Balay 
30804e2b4712SSatish Balay   PetscFunctionBegin;
3081435faa5fSBarry Smith   f             = info->fill;
3082335d9088SBarry Smith   levels        = (int)info->levels;
3083335d9088SBarry Smith   diagonal_fill = (int)info->diagonal_fill;
30844c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3085667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3086667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3087309c388cSBarry Smith 
3088309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3089bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3090bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
3091bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
3092bb3d539aSBarry Smith     if (!b->diag) {
3093bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3094bb3d539aSBarry Smith     }
3095bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3096bb3d539aSBarry Smith     b->row        = isrow;
3097bb3d539aSBarry Smith     b->col        = iscol;
3098bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3099bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3100bb3d539aSBarry Smith     b->icol       = isicol;
3101bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
310287828ca2SBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3103309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
31044e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
31054e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
31064e2b4712SSatish Balay 
31074e2b4712SSatish Balay     /* get new row pointers */
3108b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
31094e2b4712SSatish Balay     ainew[0] = 0;
31104e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
31114e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
311282502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
31134e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
311482502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
31154e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3116b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
31174e2b4712SSatish Balay     /* im is level for each filled value */
3118b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
31194e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3120b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
31214e2b4712SSatish Balay     dloc[0]  = 0;
31224e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3123435faa5fSBarry Smith 
3124435faa5fSBarry Smith       /* copy prow into linked list */
31254e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
312629bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
31274e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
31284e2b4712SSatish Balay       fill[n]    = n;
3129435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
31304e2b4712SSatish Balay       while (nz--) {
31314e2b4712SSatish Balay 	fm  = n;
31324e2b4712SSatish Balay 	idx = ic[*xi++];
31334e2b4712SSatish Balay 	do {
31344e2b4712SSatish Balay 	  m  = fm;
31354e2b4712SSatish Balay 	  fm = fill[m];
31364e2b4712SSatish Balay 	} while (fm < idx);
31374e2b4712SSatish Balay 	fill[m]   = idx;
31384e2b4712SSatish Balay 	fill[idx] = fm;
31394e2b4712SSatish Balay 	im[idx]   = 0;
31404e2b4712SSatish Balay       }
3141435faa5fSBarry Smith 
3142435faa5fSBarry Smith       /* make sure diagonal entry is included */
3143435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3144435faa5fSBarry Smith 	fm = n;
3145435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3146435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3147435faa5fSBarry Smith 	fill[fm]   = prow;
3148435faa5fSBarry Smith 	im[prow]   = 0;
3149435faa5fSBarry Smith 	nzf++;
3150335d9088SBarry Smith 	dcount++;
3151435faa5fSBarry Smith       }
3152435faa5fSBarry Smith 
31534e2b4712SSatish Balay       nzi = 0;
31544e2b4712SSatish Balay       row = fill[n];
31554e2b4712SSatish Balay       while (row < prow) {
31564e2b4712SSatish Balay 	incrlev = im[row] + 1;
31574e2b4712SSatish Balay 	nz      = dloc[row];
3158435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
31594e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
31604e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
31614e2b4712SSatish Balay 	fm      = row;
31624e2b4712SSatish Balay 	while (nnz-- > 0) {
31634e2b4712SSatish Balay 	  idx = *xi++;
31644e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
31654e2b4712SSatish Balay 	    flev++;
31664e2b4712SSatish Balay 	    continue;
31674e2b4712SSatish Balay 	  }
31684e2b4712SSatish Balay 	  do {
31694e2b4712SSatish Balay 	    m  = fm;
31704e2b4712SSatish Balay 	    fm = fill[m];
31714e2b4712SSatish Balay 	  } while (fm < idx);
31724e2b4712SSatish Balay 	  if (fm != idx) {
31734e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
31744e2b4712SSatish Balay 	    fill[m]   = idx;
31754e2b4712SSatish Balay 	    fill[idx] = fm;
31764e2b4712SSatish Balay 	    fm        = idx;
31774e2b4712SSatish Balay 	    nzf++;
3178ecf371e4SBarry Smith 	  } else {
31794e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
31804e2b4712SSatish Balay 	  }
31814e2b4712SSatish Balay 	  flev++;
31824e2b4712SSatish Balay 	}
31834e2b4712SSatish Balay 	row = fill[row];
31844e2b4712SSatish Balay 	nzi++;
31854e2b4712SSatish Balay       }
31864e2b4712SSatish Balay       /* copy new filled row into permanent storage */
31874e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
31884e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3189ecf371e4SBarry Smith 
3190ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3191ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3192ecf371e4SBarry Smith 	/* just double the memory each time */
3193ecf371e4SBarry Smith 	int maxadd = jmax;
3194ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
31954e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
31964e2b4712SSatish Balay 	jmax += maxadd;
3197ecf371e4SBarry Smith 
3198ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
319982502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3200549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3201606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
32024e2b4712SSatish Balay 	ajnew = xi;
320382502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3204549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3205606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
32064e2b4712SSatish Balay 	ajfill = xi;
3207eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
32084e2b4712SSatish Balay       }
32094e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
32104e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
32114e2b4712SSatish Balay       dloc[prow]  = nzi;
32124e2b4712SSatish Balay       fm          = fill[n];
32134e2b4712SSatish Balay       while (nzf--) {
32144e2b4712SSatish Balay 	*xi++   = fm;
32154e2b4712SSatish Balay 	*flev++ = im[fm];
32164e2b4712SSatish Balay 	fm      = fill[fm];
32174e2b4712SSatish Balay       }
3218435faa5fSBarry Smith       /* make sure row has diagonal entry */
3219435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
322029bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3221435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3222435faa5fSBarry Smith       }
32234e2b4712SSatish Balay     }
3224606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
32254e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
32264e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3227606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3228606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
32294e2b4712SSatish Balay 
32304e2b4712SSatish Balay     {
3231329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3232eb150c5cSKris Buschelman       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3233b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3234b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3235b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3236335d9088SBarry Smith       if (diagonal_fill) {
3237b1bcba4aSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3238335d9088SBarry Smith       }
32394e2b4712SSatish Balay     }
32404e2b4712SSatish Balay 
32414e2b4712SSatish Balay     /* put together the new matrix */
3242f204ca49SKris Buschelman     ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr);
3243f204ca49SKris Buschelman     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3244f204ca49SKris Buschelman     ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr);
3245b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
32464e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
3247606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
32487c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
32493f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
32504e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
3251606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
3252606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
325382502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
32544e2b4712SSatish Balay     b->j          = ajnew;
32554e2b4712SSatish Balay     b->i          = ainew;
32564e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
32574e2b4712SSatish Balay     b->diag       = dloc;
32584e2b4712SSatish Balay     b->ilen       = 0;
32594e2b4712SSatish Balay     b->imax       = 0;
32604e2b4712SSatish Balay     b->row        = isrow;
32614e2b4712SSatish Balay     b->col        = iscol;
3262bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3263c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3264c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3265e51c0b9cSSatish Balay     b->icol       = isicol;
326687828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
32674e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
32684e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
326987828ca2SBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
32704e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
32714e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
32724e2b4712SSatish Balay 
3273eb150c5cSKris Buschelman     (*fact)->info.factor_mallocs    = reallocate;
32744e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
3275329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3276309c388cSBarry Smith   }
32774e2b4712SSatish Balay 
3278309c388cSBarry Smith   if (row_identity && col_identity) {
3279732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
32808661488fSKris Buschelman   }
32818661488fSKris Buschelman   PetscFunctionReturn(0);
32828661488fSKris Buschelman }
32838661488fSKris Buschelman 
3284732ee342SKris Buschelman #undef __FUNCT__
32857e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3286dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
32877e7071cdSKris Buschelman {
328812272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
328912272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
32905a9542e3SKris Buschelman   PetscFunctionBegin;
32917cf1b8d3SKris Buschelman   /* Undo Column scaling */
32927cf1b8d3SKris Buschelman /*    while (nz--) { */
32937cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
32947cf1b8d3SKris Buschelman /*    } */
3295c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3296c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
32977cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
32987cf1b8d3SKris Buschelman }
32997cf1b8d3SKris Buschelman 
33007cf1b8d3SKris Buschelman #undef __FUNCT__
33017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
33037cf1b8d3SKris Buschelman {
33047cf1b8d3SKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
33050b9da03eSKris Buschelman   int *AJ=a->j,nz=a->nz;
33062aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
33075a9542e3SKris Buschelman   PetscFunctionBegin;
33080b9da03eSKris Buschelman   /* Is this really necessary? */
330920235379SKris Buschelman   while (nz--) {
33100b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
33117e7071cdSKris Buschelman   }
3312c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
33137e7071cdSKris Buschelman   PetscFunctionReturn(0);
33147e7071cdSKris Buschelman }
33157e7071cdSKris Buschelman 
33167e7071cdSKris Buschelman #undef __FUNCT__
3317732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3318dfbe8321SBarry Smith PetscErrorCode MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
33198661488fSKris Buschelman {
33208661488fSKris Buschelman   /*
33218661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
33228661488fSKris Buschelman       with natural ordering
33238661488fSKris Buschelman   */
33248661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
33258661488fSKris Buschelman 
33268661488fSKris Buschelman   PetscFunctionBegin;
3327a7ba9c3cSKris Buschelman   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3328a7ba9c3cSKris Buschelman   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
33298661488fSKris Buschelman   switch (a->bs) {
33308661488fSKris Buschelman   case 1:
33318661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3332732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3333732ee342SKris Buschelman     break;
3334309c388cSBarry Smith   case 2:
33358661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3336732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3337309c388cSBarry Smith     break;
3338309c388cSBarry Smith   case 3:
33398661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3340732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3341309c388cSBarry Smith     break;
3342309c388cSBarry Smith   case 4:
3343a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3344a7d8d0baSKris Buschelman     {
3345a7d8d0baSKris Buschelman       PetscTruth  sse_enabled_local;
3346dfbe8321SBarry Smith       PetscErrorCode ierr;
3347ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
33486b7cc795SKris Buschelman       if (sse_enabled_local) {
3349b988c221SKris Buschelman #  if defined(PETSC_HAVE_SSE)
33507cf1b8d3SKris Buschelman         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
33517cf1b8d3SKris Buschelman         if (n==(unsigned short)n) {
33522aa5897fSKris Buschelman           unsigned short *aj=(unsigned short *)AJ;
335313c7ffeeSKris Buschelman           for (i=0;i<nz;i++) {
33542aa5897fSKris Buschelman             aj[i] = (unsigned short)AJ[i];
335513c7ffeeSKris Buschelman           }
33567cf1b8d3SKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
33577cf1b8d3SKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
335886b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
33597cf1b8d3SKris Buschelman         } else {
33607cf1b8d3SKris Buschelman         /* Scale the column indices for easier indexing in MatSolve. */
33617cf1b8d3SKris Buschelman /*            for (i=0;i<nz;i++) { */
33627cf1b8d3SKris Buschelman /*              AJ[i] = AJ[i]*4; */
33637cf1b8d3SKris Buschelman /*            } */
33647e7071cdSKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
33658661488fSKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
336686b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
33677cf1b8d3SKris Buschelman         }
3368b988c221SKris Buschelman #  else
3369b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3370b988c221SKris Buschelman         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3371b988c221SKris Buschelman #  endif
33723ba47ebaSKris Buschelman       } else {
33738661488fSKris Buschelman         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3374732ee342SKris Buschelman         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
33753ba47ebaSKris Buschelman       }
3376a7d8d0baSKris Buschelman     }
3377a7d8d0baSKris Buschelman #else
3378a7d8d0baSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3379a7d8d0baSKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3380a7d8d0baSKris Buschelman #endif
3381309c388cSBarry Smith     break;
3382309c388cSBarry Smith   case 5:
33838661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3384732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3385309c388cSBarry Smith     break;
3386309c388cSBarry Smith   case 6:
33878661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3388732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3389309c388cSBarry Smith     break;
3390309c388cSBarry Smith   case 7:
33918661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3392732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3393309c388cSBarry Smith     break;
3394309c388cSBarry Smith   }
33954e2b4712SSatish Balay   PetscFunctionReturn(0);
33964e2b4712SSatish Balay }
3397732ee342SKris Buschelman 
3398732ee342SKris Buschelman #undef __FUNCT__
3399732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3400dfbe8321SBarry Smith PetscErrorCode MatSeqBAIJ_UpdateSolvers(Mat A)
3401732ee342SKris Buschelman {
3402732ee342SKris Buschelman   /*
3403732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3404732ee342SKris Buschelman       with natural ordering
3405732ee342SKris Buschelman   */
3406732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3407732ee342SKris Buschelman   IS          row = a->row, col = a->col;
3408732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
340923c42b7cSKris Buschelman   PetscTruth  use_natural;
3410dfbe8321SBarry Smith   PetscErrorCode ierr;
3411732ee342SKris Buschelman 
3412732ee342SKris Buschelman   PetscFunctionBegin;
3413cf242676SKris Buschelman 
341494ee7fc8SKris Buschelman   use_natural = PETSC_FALSE;
341521360622SBarry Smith   if (row && col) {
3416732ee342SKris Buschelman     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3417732ee342SKris Buschelman     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3418732ee342SKris Buschelman 
3419732ee342SKris Buschelman     if (row_identity && col_identity) {
3420732ee342SKris Buschelman       use_natural = PETSC_TRUE;
3421732ee342SKris Buschelman     }
342221360622SBarry Smith   } else {
342321360622SBarry Smith     use_natural = PETSC_TRUE;
342421360622SBarry Smith   }
342521360622SBarry Smith 
3426732ee342SKris Buschelman   switch (a->bs) {
3427732ee342SKris Buschelman   case 1:
3428732ee342SKris Buschelman     if (use_natural) {
3429732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3430732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3431732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3432732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3433732ee342SKris Buschelman     } else {
3434732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
3435732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3436732ee342SKris Buschelman     }
3437732ee342SKris Buschelman     break;
3438732ee342SKris Buschelman   case 2:
3439732ee342SKris Buschelman     if (use_natural) {
3440732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3441732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3442732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3443732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3444732ee342SKris Buschelman     } else {
3445732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
3446732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3447732ee342SKris Buschelman     }
3448732ee342SKris Buschelman     break;
3449732ee342SKris Buschelman   case 3:
3450732ee342SKris Buschelman     if (use_natural) {
3451732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3452732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3453732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3454732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3455732ee342SKris Buschelman     } else {
3456732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
3457732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3458732ee342SKris Buschelman     }
3459732ee342SKris Buschelman     break;
3460732ee342SKris Buschelman   case 4:
3461f26ec98cSKris Buschelman     {
3462123145dfSKris Buschelman       PetscTruth sse_enabled_local;
3463ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3464732ee342SKris Buschelman       if (use_natural) {
34652859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3466f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Natural + Single + SSE */
3467eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3468995eb297SKris Buschelman           int n=a->mbs;
3469995eb297SKris Buschelman           if (n==(unsigned short)n) {
3470995eb297SKris Buschelman             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3471995eb297SKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3472995eb297SKris Buschelman           } else {
3473732ee342SKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
347486b4ebfeSKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3475995eb297SKris Buschelman           }
3476eb150c5cSKris Buschelman #  else
3477eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3478eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3479eb150c5cSKris Buschelman #  endif
3480f26ec98cSKris Buschelman         } else { /* Natural + Single */
3481f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3482123145dfSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3483f26ec98cSKris Buschelman         }
34842859b196SKris Buschelman #else
34852859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3486123145dfSKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
34872859b196SKris Buschelman #endif
3488732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3489123145dfSKris Buschelman         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3490f26ec98cSKris Buschelman       } else { /* Arbitrary ordering */
34912859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3492f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3493eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3494732ee342SKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3495732ee342SKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3496eb150c5cSKris Buschelman #  else
3497eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3498eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3499eb150c5cSKris Buschelman #  endif
3500f26ec98cSKris Buschelman         } else { /* Arbitrary + Single */
3501f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3502f26ec98cSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3503732ee342SKris Buschelman         }
35042859b196SKris Buschelman #else
35052859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
35062859b196SKris Buschelman #endif
3507732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3508732ee342SKris Buschelman       }
3509f26ec98cSKris Buschelman     }
3510732ee342SKris Buschelman     break;
3511732ee342SKris Buschelman   case 5:
3512732ee342SKris Buschelman     if (use_natural) {
3513732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3514732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3515732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3516732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3517732ee342SKris Buschelman     } else {
3518732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3519732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3520732ee342SKris Buschelman     }
3521732ee342SKris Buschelman     break;
3522732ee342SKris Buschelman   case 6:
3523732ee342SKris Buschelman     if (use_natural) {
3524732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3525732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3526732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3527732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3528732ee342SKris Buschelman     } else {
3529732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3530732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3531732ee342SKris Buschelman     }
3532732ee342SKris Buschelman     break;
3533732ee342SKris Buschelman   case 7:
3534732ee342SKris Buschelman     if (use_natural) {
3535732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3536732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3537732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3538732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3539732ee342SKris Buschelman     } else {
3540732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3541732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3542732ee342SKris Buschelman     }
3543732ee342SKris Buschelman     break;
354431801e53SKris Buschelman   default:
354531801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
354631801e53SKris Buschelman     break;
3547732ee342SKris Buschelman   }
3548732ee342SKris Buschelman   PetscFunctionReturn(0);
3549732ee342SKris Buschelman }
3550732ee342SKris Buschelman 
3551732ee342SKris Buschelman #undef __FUNCT__
3552732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3553dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3554dfbe8321SBarry Smith   PetscErrorCode ierr;
3555732ee342SKris Buschelman 
3556732ee342SKris Buschelman   PetscFunctionBegin;
3557732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3558cf242676SKris Buschelman   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3559732ee342SKris Buschelman     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3560cf242676SKris Buschelman   } else {
3561cf242676SKris Buschelman     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3562cf242676SKris Buschelman   }
3563732ee342SKris Buschelman   PetscFunctionReturn(0);
3564732ee342SKris Buschelman }
3565732ee342SKris Buschelman 
3566732ee342SKris Buschelman #undef __FUNCT__
3567732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3568dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3569dfbe8321SBarry Smith   PetscErrorCode ierr;
3570732ee342SKris Buschelman 
3571732ee342SKris Buschelman   PetscFunctionBegin;
3572732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3573732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3574732ee342SKris Buschelman   PetscFunctionReturn(0);
3575732ee342SKris Buschelman }
3576732ee342SKris Buschelman 
3577732ee342SKris Buschelman 
3578