xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 719d5645761d844e4357b7ee00a3296dffe0b787)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
34e2b4712SSatish Balay /*
44e2b4712SSatish Balay     Factorization code for BAIJ format.
54e2b4712SSatish Balay */
64e2b4712SSatish Balay 
74e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18690b6cddSBarry Smith   PetscInt       *diag = a->diag;
19f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2087828ca2SBarry Smith   PetscScalar    s1,*x,*b;
21f1af5d2fSBarry Smith 
22f1af5d2fSBarry Smith   PetscFunctionBegin;
23ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26f1af5d2fSBarry Smith 
27f1af5d2fSBarry Smith   /* forward solve the U^T */
28f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
29f1af5d2fSBarry Smith 
30f1af5d2fSBarry Smith     v     = aa + diag[i];
31f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
32ef66eb69SBarry Smith     s1    = (*v++)*x[i];
33f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
34f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
35f1af5d2fSBarry Smith     while (nz--) {
36f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
37f1af5d2fSBarry Smith     }
38f1af5d2fSBarry Smith     x[i]   = s1;
39f1af5d2fSBarry Smith   }
40f1af5d2fSBarry Smith   /* backward solve the L^T */
41f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
42f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
43f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
44f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
45f1af5d2fSBarry Smith     s1   = x[i];
46f1af5d2fSBarry Smith     while (nz--) {
47f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
48f1af5d2fSBarry Smith     }
49f1af5d2fSBarry Smith   }
501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53f1af5d2fSBarry Smith   PetscFunctionReturn(0);
54f1af5d2fSBarry Smith }
55f1af5d2fSBarry Smith 
564a2ae208SSatish Balay #undef __FUNCT__
574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59f1af5d2fSBarry Smith {
60f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61dfbe8321SBarry Smith   PetscErrorCode ierr;
62690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
64f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6587828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6687828ca2SBarry Smith   PetscScalar    *x,*b;
67f1af5d2fSBarry Smith 
68f1af5d2fSBarry Smith   PetscFunctionBegin;
69ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
701ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72f1af5d2fSBarry Smith 
73f1af5d2fSBarry Smith   /* forward solve the U^T */
74f1af5d2fSBarry Smith   idx = 0;
75f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
76f1af5d2fSBarry Smith 
77f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
78f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
79ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
80f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
81f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
82f1af5d2fSBarry Smith     v += 4;
83f1af5d2fSBarry Smith 
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       oidx = 2*(*vi++);
88f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90f1af5d2fSBarry Smith       v  += 4;
91f1af5d2fSBarry Smith     }
92f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
93f1af5d2fSBarry Smith     idx += 2;
94f1af5d2fSBarry Smith   }
95f1af5d2fSBarry Smith   /* backward solve the L^T */
96f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
97f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
98f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
99f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
100f1af5d2fSBarry Smith     idt  = 2*i;
101f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
102f1af5d2fSBarry Smith     while (nz--) {
103f1af5d2fSBarry Smith       idx   = 2*(*vi--);
104f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106f1af5d2fSBarry Smith       v -= 4;
107f1af5d2fSBarry Smith     }
108f1af5d2fSBarry Smith   }
1091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
112f1af5d2fSBarry Smith   PetscFunctionReturn(0);
113f1af5d2fSBarry Smith }
114f1af5d2fSBarry Smith 
1154a2ae208SSatish Balay #undef __FUNCT__
1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118f1af5d2fSBarry Smith {
119f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120dfbe8321SBarry Smith   PetscErrorCode ierr;
121690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
123f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12487828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12587828ca2SBarry Smith   PetscScalar    *x,*b;
126f1af5d2fSBarry Smith 
127f1af5d2fSBarry Smith   PetscFunctionBegin;
128ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith   /* forward solve the U^T */
133f1af5d2fSBarry Smith   idx = 0;
134f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
135f1af5d2fSBarry Smith 
136f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
137f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
138ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142f1af5d2fSBarry Smith     v += 9;
143f1af5d2fSBarry Smith 
144f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
145f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
146f1af5d2fSBarry Smith     while (nz--) {
147f1af5d2fSBarry Smith       oidx = 3*(*vi++);
148f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151f1af5d2fSBarry Smith       v  += 9;
152f1af5d2fSBarry Smith     }
153f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154f1af5d2fSBarry Smith     idx += 3;
155f1af5d2fSBarry Smith   }
156f1af5d2fSBarry Smith   /* backward solve the L^T */
157f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
158f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
159f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
160f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
161f1af5d2fSBarry Smith     idt  = 3*i;
162f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163f1af5d2fSBarry Smith     while (nz--) {
164f1af5d2fSBarry Smith       idx   = 3*(*vi--);
165f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168f1af5d2fSBarry Smith       v -= 9;
169f1af5d2fSBarry Smith     }
170f1af5d2fSBarry Smith   }
1711ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
174f1af5d2fSBarry Smith   PetscFunctionReturn(0);
175f1af5d2fSBarry Smith }
176f1af5d2fSBarry Smith 
1774a2ae208SSatish Balay #undef __FUNCT__
1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180f1af5d2fSBarry Smith {
181f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182dfbe8321SBarry Smith   PetscErrorCode ierr;
183690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
185f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18787828ca2SBarry Smith   PetscScalar    *x,*b;
188f1af5d2fSBarry Smith 
189f1af5d2fSBarry Smith   PetscFunctionBegin;
190ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith   /* forward solve the U^T */
195f1af5d2fSBarry Smith   idx = 0;
196f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
197f1af5d2fSBarry Smith 
198f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
199f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
200ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205f1af5d2fSBarry Smith     v += 16;
206f1af5d2fSBarry Smith 
207f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
208f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
209f1af5d2fSBarry Smith     while (nz--) {
210f1af5d2fSBarry Smith       oidx = 4*(*vi++);
211f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215f1af5d2fSBarry Smith       v  += 16;
216f1af5d2fSBarry Smith     }
217f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218f1af5d2fSBarry Smith     idx += 4;
219f1af5d2fSBarry Smith   }
220f1af5d2fSBarry Smith   /* backward solve the L^T */
221f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
222f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
223f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
224f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
225f1af5d2fSBarry Smith     idt  = 4*i;
226f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227f1af5d2fSBarry Smith     while (nz--) {
228f1af5d2fSBarry Smith       idx   = 4*(*vi--);
229f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233f1af5d2fSBarry Smith       v -= 16;
234f1af5d2fSBarry Smith     }
235f1af5d2fSBarry Smith   }
2361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
239f1af5d2fSBarry Smith   PetscFunctionReturn(0);
240f1af5d2fSBarry Smith }
241f1af5d2fSBarry Smith 
2424a2ae208SSatish Balay #undef __FUNCT__
2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245f1af5d2fSBarry Smith {
246f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247dfbe8321SBarry Smith   PetscErrorCode ierr;
248690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
250f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25287828ca2SBarry Smith   PetscScalar    *x,*b;
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   PetscFunctionBegin;
255ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258f1af5d2fSBarry Smith 
259f1af5d2fSBarry Smith   /* forward solve the U^T */
260f1af5d2fSBarry Smith   idx = 0;
261f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
262f1af5d2fSBarry Smith 
263f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
264f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
265ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271f1af5d2fSBarry Smith     v += 25;
272f1af5d2fSBarry Smith 
273f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
274f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
275f1af5d2fSBarry Smith     while (nz--) {
276f1af5d2fSBarry Smith       oidx = 5*(*vi++);
277f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282f1af5d2fSBarry Smith       v  += 25;
283f1af5d2fSBarry Smith     }
284f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285f1af5d2fSBarry Smith     idx += 5;
286f1af5d2fSBarry Smith   }
287f1af5d2fSBarry Smith   /* backward solve the L^T */
288f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
289f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
290f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
291f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
292f1af5d2fSBarry Smith     idt  = 5*i;
293f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294f1af5d2fSBarry Smith     while (nz--) {
295f1af5d2fSBarry Smith       idx   = 5*(*vi--);
296f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301f1af5d2fSBarry Smith       v -= 25;
302f1af5d2fSBarry Smith     }
303f1af5d2fSBarry Smith   }
3041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
307f1af5d2fSBarry Smith   PetscFunctionReturn(0);
308f1af5d2fSBarry Smith }
309f1af5d2fSBarry Smith 
3104a2ae208SSatish Balay #undef __FUNCT__
3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313f1af5d2fSBarry Smith {
314f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315dfbe8321SBarry Smith   PetscErrorCode ierr;
316690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
318f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32087828ca2SBarry Smith   PetscScalar    *x,*b;
321f1af5d2fSBarry Smith 
322f1af5d2fSBarry Smith   PetscFunctionBegin;
323ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326f1af5d2fSBarry Smith 
327f1af5d2fSBarry Smith   /* forward solve the U^T */
328f1af5d2fSBarry Smith   idx = 0;
329f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
332f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
333ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334ef66eb69SBarry Smith     x6    = x[5+idx];
335f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341f1af5d2fSBarry Smith     v += 36;
342f1af5d2fSBarry Smith 
343f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
344f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
345f1af5d2fSBarry Smith     while (nz--) {
346f1af5d2fSBarry Smith       oidx = 6*(*vi++);
347f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353f1af5d2fSBarry Smith       v  += 36;
354f1af5d2fSBarry Smith     }
355f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356f1af5d2fSBarry Smith     x[5+idx] = s6;
357f1af5d2fSBarry Smith     idx += 6;
358f1af5d2fSBarry Smith   }
359f1af5d2fSBarry Smith   /* backward solve the L^T */
360f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
361f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
362f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
363f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
364f1af5d2fSBarry Smith     idt  = 6*i;
365f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366f1af5d2fSBarry Smith     s6 = x[5+idt];
367f1af5d2fSBarry Smith     while (nz--) {
368f1af5d2fSBarry Smith       idx   = 6*(*vi--);
369f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375f1af5d2fSBarry Smith       v -= 36;
376f1af5d2fSBarry Smith     }
377f1af5d2fSBarry Smith   }
3781ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
381f1af5d2fSBarry Smith   PetscFunctionReturn(0);
382f1af5d2fSBarry Smith }
383f1af5d2fSBarry Smith 
3844a2ae208SSatish Balay #undef __FUNCT__
3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387f1af5d2fSBarry Smith {
388f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389dfbe8321SBarry Smith   PetscErrorCode ierr;
390690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
392f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39487828ca2SBarry Smith   PetscScalar    *x,*b;
395f1af5d2fSBarry Smith 
396f1af5d2fSBarry Smith   PetscFunctionBegin;
397ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400f1af5d2fSBarry Smith 
401f1af5d2fSBarry Smith   /* forward solve the U^T */
402f1af5d2fSBarry Smith   idx = 0;
403f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
404f1af5d2fSBarry Smith 
405f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
406f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
407ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
409f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416f1af5d2fSBarry Smith     v += 49;
417f1af5d2fSBarry Smith 
418f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
419f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
420f1af5d2fSBarry Smith     while (nz--) {
421f1af5d2fSBarry Smith       oidx = 7*(*vi++);
422f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429f1af5d2fSBarry Smith       v  += 49;
430f1af5d2fSBarry Smith     }
431f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
433f1af5d2fSBarry Smith     idx += 7;
434f1af5d2fSBarry Smith   }
435f1af5d2fSBarry Smith   /* backward solve the L^T */
436f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
437f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
438f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
439f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
440f1af5d2fSBarry Smith     idt  = 7*i;
441f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
443f1af5d2fSBarry Smith     while (nz--) {
444f1af5d2fSBarry Smith       idx   = 7*(*vi--);
445f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452f1af5d2fSBarry Smith       v -= 49;
453f1af5d2fSBarry Smith     }
454f1af5d2fSBarry Smith   }
4551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
458f1af5d2fSBarry Smith   PetscFunctionReturn(0);
459f1af5d2fSBarry Smith }
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4624a2ae208SSatish Balay #undef __FUNCT__
4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465f1af5d2fSBarry Smith {
466f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4686849ba73SBarry Smith   PetscErrorCode ierr;
469690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
470690b6cddSBarry Smith   PetscInt       *diag = a->diag;
471f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47287828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
473f1af5d2fSBarry Smith 
474f1af5d2fSBarry Smith   PetscFunctionBegin;
4751ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477f1af5d2fSBarry Smith   t  = a->solve_work;
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
483f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
484f1af5d2fSBarry Smith     t[i] = b[c[i]];
485f1af5d2fSBarry Smith   }
486f1af5d2fSBarry Smith 
487f1af5d2fSBarry Smith   /* forward solve the U^T */
488f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith     v     = aa + diag[i];
491f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
492f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
493f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
494f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
495f1af5d2fSBarry Smith     while (nz--) {
496f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
497f1af5d2fSBarry Smith     }
498f1af5d2fSBarry Smith     t[i]   = s1;
499f1af5d2fSBarry Smith   }
500f1af5d2fSBarry Smith   /* backward solve the L^T */
501f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
502f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
503f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
504f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
505f1af5d2fSBarry Smith     s1   = t[i];
506f1af5d2fSBarry Smith     while (nz--) {
507f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
508f1af5d2fSBarry Smith     }
509f1af5d2fSBarry Smith   }
510f1af5d2fSBarry Smith 
511f1af5d2fSBarry Smith   /* copy t into x according to permutation */
512f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
513f1af5d2fSBarry Smith     x[r[i]]   = t[i];
514f1af5d2fSBarry Smith   }
515f1af5d2fSBarry Smith 
516f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5181ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
521f1af5d2fSBarry Smith   PetscFunctionReturn(0);
522f1af5d2fSBarry Smith }
523f1af5d2fSBarry Smith 
5244a2ae208SSatish Balay #undef __FUNCT__
5254a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
526dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527f1af5d2fSBarry Smith {
528f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5306849ba73SBarry Smith   PetscErrorCode ierr;
531690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
532690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
533f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53487828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
536f1af5d2fSBarry Smith 
537f1af5d2fSBarry Smith   PetscFunctionBegin;
5381ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5391ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
540f1af5d2fSBarry Smith   t  = a->solve_work;
541f1af5d2fSBarry Smith 
542f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
543f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
544f1af5d2fSBarry Smith 
545f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
546f1af5d2fSBarry Smith   ii = 0;
547f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
548f1af5d2fSBarry Smith     ic      = 2*c[i];
549f1af5d2fSBarry Smith     t[ii]   = b[ic];
550f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
551f1af5d2fSBarry Smith     ii += 2;
552f1af5d2fSBarry Smith   }
553f1af5d2fSBarry Smith 
554f1af5d2fSBarry Smith   /* forward solve the U^T */
555f1af5d2fSBarry Smith   idx = 0;
556f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
559f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
560f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
561f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
562f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
563f1af5d2fSBarry Smith     v += 4;
564f1af5d2fSBarry Smith 
565f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
566f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
567f1af5d2fSBarry Smith     while (nz--) {
568f1af5d2fSBarry Smith       oidx = 2*(*vi++);
569f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
570f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
571f1af5d2fSBarry Smith       v  += 4;
572f1af5d2fSBarry Smith     }
573f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
574f1af5d2fSBarry Smith     idx += 2;
575f1af5d2fSBarry Smith   }
576f1af5d2fSBarry Smith   /* backward solve the L^T */
577f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
578f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
579f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
580f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
581f1af5d2fSBarry Smith     idt  = 2*i;
582f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
583f1af5d2fSBarry Smith     while (nz--) {
584f1af5d2fSBarry Smith       idx   = 2*(*vi--);
585f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
586f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
587f1af5d2fSBarry Smith       v -= 4;
588f1af5d2fSBarry Smith     }
589f1af5d2fSBarry Smith   }
590f1af5d2fSBarry Smith 
591f1af5d2fSBarry Smith   /* copy t into x according to permutation */
592f1af5d2fSBarry Smith   ii = 0;
593f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
594f1af5d2fSBarry Smith     ir      = 2*r[i];
595f1af5d2fSBarry Smith     x[ir]   = t[ii];
596f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
597f1af5d2fSBarry Smith     ii += 2;
598f1af5d2fSBarry Smith   }
599f1af5d2fSBarry Smith 
600f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
601f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6021ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
604d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   PetscFunctionReturn(0);
606f1af5d2fSBarry Smith }
607f1af5d2fSBarry Smith 
6084a2ae208SSatish Balay #undef __FUNCT__
6094a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
610dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
611f1af5d2fSBarry Smith {
612f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
613f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6146849ba73SBarry Smith   PetscErrorCode ierr;
615690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
616690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
617f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
61887828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
61987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
620f1af5d2fSBarry Smith 
621f1af5d2fSBarry Smith   PetscFunctionBegin;
6221ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
624f1af5d2fSBarry Smith   t  = a->solve_work;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
627f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
630f1af5d2fSBarry Smith   ii = 0;
631f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
632f1af5d2fSBarry Smith     ic      = 3*c[i];
633f1af5d2fSBarry Smith     t[ii]   = b[ic];
634f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
635f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
636f1af5d2fSBarry Smith     ii += 3;
637f1af5d2fSBarry Smith   }
638f1af5d2fSBarry Smith 
639f1af5d2fSBarry Smith   /* forward solve the U^T */
640f1af5d2fSBarry Smith   idx = 0;
641f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
642f1af5d2fSBarry Smith 
643f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
644f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
645f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
646f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
647f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
648f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
649f1af5d2fSBarry Smith     v += 9;
650f1af5d2fSBarry Smith 
651f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
652f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
653f1af5d2fSBarry Smith     while (nz--) {
654f1af5d2fSBarry Smith       oidx = 3*(*vi++);
655f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
656f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
657f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658f1af5d2fSBarry Smith       v  += 9;
659f1af5d2fSBarry Smith     }
660f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
661f1af5d2fSBarry Smith     idx += 3;
662f1af5d2fSBarry Smith   }
663f1af5d2fSBarry Smith   /* backward solve the L^T */
664f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
665f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
666f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
667f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
668f1af5d2fSBarry Smith     idt  = 3*i;
669f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
670f1af5d2fSBarry Smith     while (nz--) {
671f1af5d2fSBarry Smith       idx   = 3*(*vi--);
672f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
673f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
674f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
675f1af5d2fSBarry Smith       v -= 9;
676f1af5d2fSBarry Smith     }
677f1af5d2fSBarry Smith   }
678f1af5d2fSBarry Smith 
679f1af5d2fSBarry Smith   /* copy t into x according to permutation */
680f1af5d2fSBarry Smith   ii = 0;
681f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
682f1af5d2fSBarry Smith     ir      = 3*r[i];
683f1af5d2fSBarry Smith     x[ir]   = t[ii];
684f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
685f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
686f1af5d2fSBarry Smith     ii += 3;
687f1af5d2fSBarry Smith   }
688f1af5d2fSBarry Smith 
689f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
690f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
693d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
694f1af5d2fSBarry Smith   PetscFunctionReturn(0);
695f1af5d2fSBarry Smith }
696f1af5d2fSBarry Smith 
6974a2ae208SSatish Balay #undef __FUNCT__
6984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
699dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
700f1af5d2fSBarry Smith {
701f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
702f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7036849ba73SBarry Smith   PetscErrorCode ierr;
704690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
705690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
706f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
70787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
70887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
709f1af5d2fSBarry Smith 
710f1af5d2fSBarry Smith   PetscFunctionBegin;
7111ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7121ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
713f1af5d2fSBarry Smith   t  = a->solve_work;
714f1af5d2fSBarry Smith 
715f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
716f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
717f1af5d2fSBarry Smith 
718f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
719f1af5d2fSBarry Smith   ii = 0;
720f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
721f1af5d2fSBarry Smith     ic      = 4*c[i];
722f1af5d2fSBarry Smith     t[ii]   = b[ic];
723f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
724f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
725f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
726f1af5d2fSBarry Smith     ii += 4;
727f1af5d2fSBarry Smith   }
728f1af5d2fSBarry Smith 
729f1af5d2fSBarry Smith   /* forward solve the U^T */
730f1af5d2fSBarry Smith   idx = 0;
731f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
732f1af5d2fSBarry Smith 
733f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
734f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
735f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
736f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
737f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
738f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
739f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
740f1af5d2fSBarry Smith     v += 16;
741f1af5d2fSBarry Smith 
742f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
743f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
744f1af5d2fSBarry Smith     while (nz--) {
745f1af5d2fSBarry Smith       oidx = 4*(*vi++);
746f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750f1af5d2fSBarry Smith       v  += 16;
751f1af5d2fSBarry Smith     }
752f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
753f1af5d2fSBarry Smith     idx += 4;
754f1af5d2fSBarry Smith   }
755f1af5d2fSBarry Smith   /* backward solve the L^T */
756f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
757f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
758f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
759f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
760f1af5d2fSBarry Smith     idt  = 4*i;
761f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
762f1af5d2fSBarry Smith     while (nz--) {
763f1af5d2fSBarry Smith       idx   = 4*(*vi--);
764f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
765f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
766f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
767f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
768f1af5d2fSBarry Smith       v -= 16;
769f1af5d2fSBarry Smith     }
770f1af5d2fSBarry Smith   }
771f1af5d2fSBarry Smith 
772f1af5d2fSBarry Smith   /* copy t into x according to permutation */
773f1af5d2fSBarry Smith   ii = 0;
774f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
775f1af5d2fSBarry Smith     ir      = 4*r[i];
776f1af5d2fSBarry Smith     x[ir]   = t[ii];
777f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
778f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
779f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
780f1af5d2fSBarry Smith     ii += 4;
781f1af5d2fSBarry Smith   }
782f1af5d2fSBarry Smith 
783f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
784f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7851ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7861ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
787d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
788f1af5d2fSBarry Smith   PetscFunctionReturn(0);
789f1af5d2fSBarry Smith }
790f1af5d2fSBarry Smith 
7914a2ae208SSatish Balay #undef __FUNCT__
7924a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
793dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
794f1af5d2fSBarry Smith {
795f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7976849ba73SBarry Smith   PetscErrorCode ierr;
798690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
799690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
800f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
803f1af5d2fSBarry Smith 
804f1af5d2fSBarry Smith   PetscFunctionBegin;
8051ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8061ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
807f1af5d2fSBarry Smith   t  = a->solve_work;
808f1af5d2fSBarry Smith 
809f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
810f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
811f1af5d2fSBarry Smith 
812f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
813f1af5d2fSBarry Smith   ii = 0;
814f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
815f1af5d2fSBarry Smith     ic      = 5*c[i];
816f1af5d2fSBarry Smith     t[ii]   = b[ic];
817f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
818f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
819f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
820f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
821f1af5d2fSBarry Smith     ii += 5;
822f1af5d2fSBarry Smith   }
823f1af5d2fSBarry Smith 
824f1af5d2fSBarry Smith   /* forward solve the U^T */
825f1af5d2fSBarry Smith   idx = 0;
826f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
827f1af5d2fSBarry Smith 
828f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
829f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
830f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
831f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
832f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
833f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
834f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
835f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
836f1af5d2fSBarry Smith     v += 25;
837f1af5d2fSBarry Smith 
838f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
839f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
840f1af5d2fSBarry Smith     while (nz--) {
841f1af5d2fSBarry Smith       oidx = 5*(*vi++);
842f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847f1af5d2fSBarry Smith       v  += 25;
848f1af5d2fSBarry Smith     }
849f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
850f1af5d2fSBarry Smith     idx += 5;
851f1af5d2fSBarry Smith   }
852f1af5d2fSBarry Smith   /* backward solve the L^T */
853f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
854f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
855f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
856f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
857f1af5d2fSBarry Smith     idt  = 5*i;
858f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
859f1af5d2fSBarry Smith     while (nz--) {
860f1af5d2fSBarry Smith       idx   = 5*(*vi--);
861f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
862f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
863f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
864f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
865f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
866f1af5d2fSBarry Smith       v -= 25;
867f1af5d2fSBarry Smith     }
868f1af5d2fSBarry Smith   }
869f1af5d2fSBarry Smith 
870f1af5d2fSBarry Smith   /* copy t into x according to permutation */
871f1af5d2fSBarry Smith   ii = 0;
872f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
873f1af5d2fSBarry Smith     ir      = 5*r[i];
874f1af5d2fSBarry Smith     x[ir]   = t[ii];
875f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
876f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
877f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
878f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
879f1af5d2fSBarry Smith     ii += 5;
880f1af5d2fSBarry Smith   }
881f1af5d2fSBarry Smith 
882f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
883f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8841ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8851ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
886d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
887f1af5d2fSBarry Smith   PetscFunctionReturn(0);
888f1af5d2fSBarry Smith }
889f1af5d2fSBarry Smith 
8904a2ae208SSatish Balay #undef __FUNCT__
8914a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
892dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
893f1af5d2fSBarry Smith {
894f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
895f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8966849ba73SBarry Smith   PetscErrorCode ierr;
897690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
898690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
899f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90087828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
902f1af5d2fSBarry Smith 
903f1af5d2fSBarry Smith   PetscFunctionBegin;
9041ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9051ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
906f1af5d2fSBarry Smith   t  = a->solve_work;
907f1af5d2fSBarry Smith 
908f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
909f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
912f1af5d2fSBarry Smith   ii = 0;
913f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
914f1af5d2fSBarry Smith     ic      = 6*c[i];
915f1af5d2fSBarry Smith     t[ii]   = b[ic];
916f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
917f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
918f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
919f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
920f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
921f1af5d2fSBarry Smith     ii += 6;
922f1af5d2fSBarry Smith   }
923f1af5d2fSBarry Smith 
924f1af5d2fSBarry Smith   /* forward solve the U^T */
925f1af5d2fSBarry Smith   idx = 0;
926f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
927f1af5d2fSBarry Smith 
928f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
929f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
930f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
931f1af5d2fSBarry Smith     x6    = t[5+idx];
932f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
933f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
934f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
935f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
936f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
937f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
938f1af5d2fSBarry Smith     v += 36;
939f1af5d2fSBarry Smith 
940f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
941f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
942f1af5d2fSBarry Smith     while (nz--) {
943f1af5d2fSBarry Smith       oidx = 6*(*vi++);
944f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
945f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
946f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
947f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
948f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
949f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
950f1af5d2fSBarry Smith       v  += 36;
951f1af5d2fSBarry Smith     }
952f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
953f1af5d2fSBarry Smith     t[5+idx] = s6;
954f1af5d2fSBarry Smith     idx += 6;
955f1af5d2fSBarry Smith   }
956f1af5d2fSBarry Smith   /* backward solve the L^T */
957f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
958f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
959f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
960f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
961f1af5d2fSBarry Smith     idt  = 6*i;
962f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
963f1af5d2fSBarry Smith     s6 = t[5+idt];
964f1af5d2fSBarry Smith     while (nz--) {
965f1af5d2fSBarry Smith       idx   = 6*(*vi--);
966f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
967f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
968f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
969f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
970f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
971f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
972f1af5d2fSBarry Smith       v -= 36;
973f1af5d2fSBarry Smith     }
974f1af5d2fSBarry Smith   }
975f1af5d2fSBarry Smith 
976f1af5d2fSBarry Smith   /* copy t into x according to permutation */
977f1af5d2fSBarry Smith   ii = 0;
978f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
979f1af5d2fSBarry Smith     ir      = 6*r[i];
980f1af5d2fSBarry Smith     x[ir]   = t[ii];
981f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
982f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
983f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
984f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
985f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
986f1af5d2fSBarry Smith     ii += 6;
987f1af5d2fSBarry Smith   }
988f1af5d2fSBarry Smith 
989f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
990f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
993d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
994f1af5d2fSBarry Smith   PetscFunctionReturn(0);
995f1af5d2fSBarry Smith }
996f1af5d2fSBarry Smith 
9974a2ae208SSatish Balay #undef __FUNCT__
9984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
999dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1000f1af5d2fSBarry Smith {
1001f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1002f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10036849ba73SBarry Smith   PetscErrorCode ierr;
1004690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
1005690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1006f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
100787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
100887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1009f1af5d2fSBarry Smith 
1010f1af5d2fSBarry Smith   PetscFunctionBegin;
10111ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10121ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1013f1af5d2fSBarry Smith   t  = a->solve_work;
1014f1af5d2fSBarry Smith 
1015f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1016f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1017f1af5d2fSBarry Smith 
1018f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1019f1af5d2fSBarry Smith   ii = 0;
1020f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1021f1af5d2fSBarry Smith     ic      = 7*c[i];
1022f1af5d2fSBarry Smith     t[ii]   = b[ic];
1023f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1024f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1025f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1026f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1027f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1028f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1029f1af5d2fSBarry Smith     ii += 7;
1030f1af5d2fSBarry Smith   }
1031f1af5d2fSBarry Smith 
1032f1af5d2fSBarry Smith   /* forward solve the U^T */
1033f1af5d2fSBarry Smith   idx = 0;
1034f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1035f1af5d2fSBarry Smith 
1036f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1037f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1038f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1039f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1040f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1041f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1042f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1043f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1044f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1045f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1046f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1047f1af5d2fSBarry Smith     v += 49;
1048f1af5d2fSBarry Smith 
1049f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1050f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1051f1af5d2fSBarry Smith     while (nz--) {
1052f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1053f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1054f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1055f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1056f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1057f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1058f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1059f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1060f1af5d2fSBarry Smith       v  += 49;
1061f1af5d2fSBarry Smith     }
1062f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1063f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1064f1af5d2fSBarry Smith     idx += 7;
1065f1af5d2fSBarry Smith   }
1066f1af5d2fSBarry Smith   /* backward solve the L^T */
1067f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1068f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1069f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1070f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1071f1af5d2fSBarry Smith     idt  = 7*i;
1072f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1073f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1074f1af5d2fSBarry Smith     while (nz--) {
1075f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1076f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1077f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1078f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1079f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1080f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1081f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1082f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1083f1af5d2fSBarry Smith       v -= 49;
1084f1af5d2fSBarry Smith     }
1085f1af5d2fSBarry Smith   }
1086f1af5d2fSBarry Smith 
1087f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1088f1af5d2fSBarry Smith   ii = 0;
1089f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1090f1af5d2fSBarry Smith     ir      = 7*r[i];
1091f1af5d2fSBarry Smith     x[ir]   = t[ii];
1092f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1093f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1094f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1095f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1096f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1097f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1098f1af5d2fSBarry Smith     ii += 7;
1099f1af5d2fSBarry Smith   }
1100f1af5d2fSBarry Smith 
1101f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1102f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11031ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1105d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1106f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1107f1af5d2fSBarry Smith }
1108f1af5d2fSBarry Smith 
11094e2b4712SSatish Balay /* ----------------------------------------------------------- */
11104a2ae208SSatish Balay #undef __FUNCT__
11114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1112dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11134e2b4712SSatish Balay {
11144e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11154e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11166849ba73SBarry Smith   PetscErrorCode ierr;
1117690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1118d0f46423SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2,*rout,*cout;
11193f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
112087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11214e2b4712SSatish Balay 
11224e2b4712SSatish Balay   PetscFunctionBegin;
11231ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1125f1af5d2fSBarry Smith   t  = a->solve_work;
11264e2b4712SSatish Balay 
11274e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11284e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11294e2b4712SSatish Balay 
11304e2b4712SSatish Balay   /* forward solve the lower triangular */
113187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11324e2b4712SSatish Balay   for (i=1; i<n; i++) {
11334e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11344e2b4712SSatish Balay     vi  = aj + ai[i];
11354e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1136f1af5d2fSBarry Smith     s = t + bs*i;
113787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11384e2b4712SSatish Balay     while (nz--) {
1139f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11404e2b4712SSatish Balay       v += bs2;
11414e2b4712SSatish Balay     }
11424e2b4712SSatish Balay   }
11434e2b4712SSatish Balay   /* backward solve the upper triangular */
1144d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11454e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11464e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11474e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11484e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
114987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11504e2b4712SSatish Balay     while (nz--) {
1151f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11524e2b4712SSatish Balay       v += bs2;
11534e2b4712SSatish Balay     }
1154f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
115587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11564e2b4712SSatish Balay   }
11574e2b4712SSatish Balay 
11584e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11594e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11601ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11611ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1162d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11634e2b4712SSatish Balay   PetscFunctionReturn(0);
11644e2b4712SSatish Balay }
11654e2b4712SSatish Balay 
11664a2ae208SSatish Balay #undef __FUNCT__
11674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1168dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11694e2b4712SSatish Balay {
11704e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11714e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11726849ba73SBarry Smith   PetscErrorCode ierr;
1173690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1174690b6cddSBarry Smith   PetscInt       *diag = a->diag;
11753f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
117687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
117787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11784e2b4712SSatish Balay 
11794e2b4712SSatish Balay   PetscFunctionBegin;
11801ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1182f1af5d2fSBarry Smith   t  = a->solve_work;
11834e2b4712SSatish Balay 
11844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11864e2b4712SSatish Balay 
11874e2b4712SSatish Balay   /* forward solve the lower triangular */
11884e2b4712SSatish Balay   idx    = 7*(*r++);
1189f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1190f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1191f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11924e2b4712SSatish Balay 
11934e2b4712SSatish Balay   for (i=1; i<n; i++) {
11944e2b4712SSatish Balay     v     = aa + 49*ai[i];
11954e2b4712SSatish Balay     vi    = aj + ai[i];
11964e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11974e2b4712SSatish Balay     idx   = 7*(*r++);
1198f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1199f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12004e2b4712SSatish Balay     while (nz--) {
12014e2b4712SSatish Balay       idx   = 7*(*vi++);
1202f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1203f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1204f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1205f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1206f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1207f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1208f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1209f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1210f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1211f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12124e2b4712SSatish Balay       v += 49;
12134e2b4712SSatish Balay     }
12144e2b4712SSatish Balay     idx = 7*i;
1215f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1216f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1217f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12184e2b4712SSatish Balay   }
12194e2b4712SSatish Balay   /* backward solve the upper triangular */
12204e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12214e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12224e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12234e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12244e2b4712SSatish Balay     idt  = 7*i;
1225f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1226f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1227f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12284e2b4712SSatish Balay     while (nz--) {
12294e2b4712SSatish Balay       idx   = 7*(*vi++);
1230f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1231f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1232f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1233f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1234f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1235f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1236f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1237f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1238f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1239f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12404e2b4712SSatish Balay       v += 49;
12414e2b4712SSatish Balay     }
12424e2b4712SSatish Balay     idc = 7*(*c--);
12434e2b4712SSatish Balay     v   = aa + 49*diag[i];
1244f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1245f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1246f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1247f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1248f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1249f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1250f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1251f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1252f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1253f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1254f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1255f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1256f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1257f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12584e2b4712SSatish Balay   }
12594e2b4712SSatish Balay 
12604e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12614e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12621ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1264d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
12654e2b4712SSatish Balay   PetscFunctionReturn(0);
12664e2b4712SSatish Balay }
12674e2b4712SSatish Balay 
12684a2ae208SSatish Balay #undef __FUNCT__
12694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1270dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
127115091d37SBarry Smith {
127215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1273690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1274dfbe8321SBarry Smith   PetscErrorCode    ierr;
1275690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1276d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1277d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1278d9fead3dSBarry Smith   const PetscScalar *b;
127915091d37SBarry Smith 
128015091d37SBarry Smith   PetscFunctionBegin;
1281d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128315091d37SBarry Smith   /* forward solve the lower triangular */
128415091d37SBarry Smith   idx    = 0;
128515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
128615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
128715091d37SBarry Smith   x[6] = b[6+idx];
128815091d37SBarry Smith   for (i=1; i<n; i++) {
128915091d37SBarry Smith     v     =  aa + 49*ai[i];
129015091d37SBarry Smith     vi    =  aj + ai[i];
129115091d37SBarry Smith     nz    =  diag[i] - ai[i];
129215091d37SBarry Smith     idx   =  7*i;
1293f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1294f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1295f1af5d2fSBarry Smith     s7  =  b[6+idx];
129615091d37SBarry Smith     while (nz--) {
129715091d37SBarry Smith       jdx   = 7*(*vi++);
129815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
129915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
130015091d37SBarry Smith       x7    = x[6+jdx];
1301f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1302f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1303f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1304f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1305f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1306f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1307f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
130815091d37SBarry Smith       v += 49;
130915091d37SBarry Smith      }
1310f1af5d2fSBarry Smith     x[idx]   = s1;
1311f1af5d2fSBarry Smith     x[1+idx] = s2;
1312f1af5d2fSBarry Smith     x[2+idx] = s3;
1313f1af5d2fSBarry Smith     x[3+idx] = s4;
1314f1af5d2fSBarry Smith     x[4+idx] = s5;
1315f1af5d2fSBarry Smith     x[5+idx] = s6;
1316f1af5d2fSBarry Smith     x[6+idx] = s7;
131715091d37SBarry Smith   }
131815091d37SBarry Smith   /* backward solve the upper triangular */
131915091d37SBarry Smith   for (i=n-1; i>=0; i--){
132015091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
132115091d37SBarry Smith     vi   = aj + diag[i] + 1;
132215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
132315091d37SBarry Smith     idt  = 7*i;
1324f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1325f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1326f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1327f1af5d2fSBarry Smith     s7 = x[6+idt];
132815091d37SBarry Smith     while (nz--) {
132915091d37SBarry Smith       idx   = 7*(*vi++);
133015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
133115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
133215091d37SBarry Smith       x7    = x[6+idx];
1333f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1334f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1335f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1336f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1337f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1338f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1339f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134015091d37SBarry Smith       v += 49;
134115091d37SBarry Smith     }
134215091d37SBarry Smith     v        = aa + 49*diag[i];
1343f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1344f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1345f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1346f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1347f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1348f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1349f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1350f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1351f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1352f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1353f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1354f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1355f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1356f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
135715091d37SBarry Smith   }
135815091d37SBarry Smith 
1359d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1361d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
136215091d37SBarry Smith   PetscFunctionReturn(0);
136315091d37SBarry Smith }
136415091d37SBarry Smith 
13654a2ae208SSatish Balay #undef __FUNCT__
13664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1367dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
136815091d37SBarry Smith {
136915091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
137015091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
13716849ba73SBarry Smith   PetscErrorCode    ierr;
1372690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1373690b6cddSBarry Smith   PetscInt          *diag = a->diag;
1374d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1375d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1376d9fead3dSBarry Smith   const PetscScalar *b;
137715091d37SBarry Smith   PetscFunctionBegin;
1378d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1380f1af5d2fSBarry Smith   t  = a->solve_work;
138115091d37SBarry Smith 
138215091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
138315091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
138415091d37SBarry Smith 
138515091d37SBarry Smith   /* forward solve the lower triangular */
138615091d37SBarry Smith   idx    = 6*(*r++);
1387f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1388f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1389f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
139015091d37SBarry Smith   for (i=1; i<n; i++) {
139115091d37SBarry Smith     v     = aa + 36*ai[i];
139215091d37SBarry Smith     vi    = aj + ai[i];
139315091d37SBarry Smith     nz    = diag[i] - ai[i];
139415091d37SBarry Smith     idx   = 6*(*r++);
1395f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1396f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
139715091d37SBarry Smith     while (nz--) {
139815091d37SBarry Smith       idx   = 6*(*vi++);
1399f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1400f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1401f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1402f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1403f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1404f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1405f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1406f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
140715091d37SBarry Smith       v += 36;
140815091d37SBarry Smith     }
140915091d37SBarry Smith     idx = 6*i;
1410f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1411f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1412f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
141315091d37SBarry Smith   }
141415091d37SBarry Smith   /* backward solve the upper triangular */
141515091d37SBarry Smith   for (i=n-1; i>=0; i--){
141615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
141715091d37SBarry Smith     vi   = aj + diag[i] + 1;
141815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
141915091d37SBarry Smith     idt  = 6*i;
1420f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1421f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1422f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
142315091d37SBarry Smith     while (nz--) {
142415091d37SBarry Smith       idx   = 6*(*vi++);
1425f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1426f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1427f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1428f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1429f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1430f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1431f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1432f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1433f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
143415091d37SBarry Smith       v += 36;
143515091d37SBarry Smith     }
143615091d37SBarry Smith     idc = 6*(*c--);
143715091d37SBarry Smith     v   = aa + 36*diag[i];
1438f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1439f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1440f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1441f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1442f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1443f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1444f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1445f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1446f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1447f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1448f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1449f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
145015091d37SBarry Smith   }
145115091d37SBarry Smith 
145215091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
145315091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1454d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1456d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
145715091d37SBarry Smith   PetscFunctionReturn(0);
145815091d37SBarry Smith }
145915091d37SBarry Smith 
14604a2ae208SSatish Balay #undef __FUNCT__
14614a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1462dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
146315091d37SBarry Smith {
146415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1465690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1466dfbe8321SBarry Smith   PetscErrorCode    ierr;
1467690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1468d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1469d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1470d9fead3dSBarry Smith   const PetscScalar *b;
147115091d37SBarry Smith 
147215091d37SBarry Smith   PetscFunctionBegin;
1473d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
147515091d37SBarry Smith   /* forward solve the lower triangular */
147615091d37SBarry Smith   idx    = 0;
147715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
147815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
147915091d37SBarry Smith   for (i=1; i<n; i++) {
148015091d37SBarry Smith     v     =  aa + 36*ai[i];
148115091d37SBarry Smith     vi    =  aj + ai[i];
148215091d37SBarry Smith     nz    =  diag[i] - ai[i];
148315091d37SBarry Smith     idx   =  6*i;
1484f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1485f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
148615091d37SBarry Smith     while (nz--) {
148715091d37SBarry Smith       jdx   = 6*(*vi++);
148815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
148915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1490f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
149615091d37SBarry Smith       v += 36;
149715091d37SBarry Smith      }
1498f1af5d2fSBarry Smith     x[idx]   = s1;
1499f1af5d2fSBarry Smith     x[1+idx] = s2;
1500f1af5d2fSBarry Smith     x[2+idx] = s3;
1501f1af5d2fSBarry Smith     x[3+idx] = s4;
1502f1af5d2fSBarry Smith     x[4+idx] = s5;
1503f1af5d2fSBarry Smith     x[5+idx] = s6;
150415091d37SBarry Smith   }
150515091d37SBarry Smith   /* backward solve the upper triangular */
150615091d37SBarry Smith   for (i=n-1; i>=0; i--){
150715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
150815091d37SBarry Smith     vi   = aj + diag[i] + 1;
150915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151015091d37SBarry Smith     idt  = 6*i;
1511f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1512f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1513f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
151415091d37SBarry Smith     while (nz--) {
151515091d37SBarry Smith       idx   = 6*(*vi++);
151615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
151715091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1518f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1519f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1520f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1521f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1522f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1523f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
152415091d37SBarry Smith       v += 36;
152515091d37SBarry Smith     }
152615091d37SBarry Smith     v        = aa + 36*diag[i];
1527f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1528f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1529f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1530f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1531f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1532f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
153315091d37SBarry Smith   }
153415091d37SBarry Smith 
1535d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1537d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
153815091d37SBarry Smith   PetscFunctionReturn(0);
153915091d37SBarry Smith }
154015091d37SBarry Smith 
15414a2ae208SSatish Balay #undef __FUNCT__
15424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1543dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15444e2b4712SSatish Balay {
15454e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
15464e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
15476849ba73SBarry Smith   PetscErrorCode    ierr;
1548690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1549690b6cddSBarry Smith   PetscInt          *diag = a->diag;
1550d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1551d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1552d9fead3dSBarry Smith   const PetscScalar *b;
15534e2b4712SSatish Balay 
15544e2b4712SSatish Balay   PetscFunctionBegin;
1555d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1557f1af5d2fSBarry Smith   t  = a->solve_work;
15584e2b4712SSatish Balay 
15594e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15604e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15614e2b4712SSatish Balay 
15624e2b4712SSatish Balay   /* forward solve the lower triangular */
15634e2b4712SSatish Balay   idx    = 5*(*r++);
1564f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1565f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15664e2b4712SSatish Balay   for (i=1; i<n; i++) {
15674e2b4712SSatish Balay     v     = aa + 25*ai[i];
15684e2b4712SSatish Balay     vi    = aj + ai[i];
15694e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15704e2b4712SSatish Balay     idx   = 5*(*r++);
1571f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1572f1af5d2fSBarry Smith     s5  = b[4+idx];
15734e2b4712SSatish Balay     while (nz--) {
15744e2b4712SSatish Balay       idx   = 5*(*vi++);
1575f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1576f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1577f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15824e2b4712SSatish Balay       v += 25;
15834e2b4712SSatish Balay     }
15844e2b4712SSatish Balay     idx = 5*i;
1585f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1586f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15874e2b4712SSatish Balay   }
15884e2b4712SSatish Balay   /* backward solve the upper triangular */
15894e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15904e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15914e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15924e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15934e2b4712SSatish Balay     idt  = 5*i;
1594f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1595f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15964e2b4712SSatish Balay     while (nz--) {
15974e2b4712SSatish Balay       idx   = 5*(*vi++);
1598f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1599f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1600f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1601f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1602f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1603f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1604f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16054e2b4712SSatish Balay       v += 25;
16064e2b4712SSatish Balay     }
16074e2b4712SSatish Balay     idc = 5*(*c--);
16084e2b4712SSatish Balay     v   = aa + 25*diag[i];
1609f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1610f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1611f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1612f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1613f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1614f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1615f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1616f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1617f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1618f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16194e2b4712SSatish Balay   }
16204e2b4712SSatish Balay 
16214e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16224e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1623d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16241ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1625d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
16264e2b4712SSatish Balay   PetscFunctionReturn(0);
16274e2b4712SSatish Balay }
16284e2b4712SSatish Balay 
16294a2ae208SSatish Balay #undef __FUNCT__
16304a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1631dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
163215091d37SBarry Smith {
163315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1634690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1635dfbe8321SBarry Smith   PetscErrorCode    ierr;
1636690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1637d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1638d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1639d9fead3dSBarry Smith   const PetscScalar *b;
164015091d37SBarry Smith 
164115091d37SBarry Smith   PetscFunctionBegin;
1642d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
164415091d37SBarry Smith   /* forward solve the lower triangular */
164515091d37SBarry Smith   idx    = 0;
164615091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
164715091d37SBarry Smith   for (i=1; i<n; i++) {
164815091d37SBarry Smith     v     =  aa + 25*ai[i];
164915091d37SBarry Smith     vi    =  aj + ai[i];
165015091d37SBarry Smith     nz    =  diag[i] - ai[i];
165115091d37SBarry Smith     idx   =  5*i;
1652f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
165315091d37SBarry Smith     while (nz--) {
165415091d37SBarry Smith       jdx   = 5*(*vi++);
165515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1656f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1657f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1658f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1659f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1660f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166115091d37SBarry Smith       v    += 25;
166215091d37SBarry Smith     }
1663f1af5d2fSBarry Smith     x[idx]   = s1;
1664f1af5d2fSBarry Smith     x[1+idx] = s2;
1665f1af5d2fSBarry Smith     x[2+idx] = s3;
1666f1af5d2fSBarry Smith     x[3+idx] = s4;
1667f1af5d2fSBarry Smith     x[4+idx] = s5;
166815091d37SBarry Smith   }
166915091d37SBarry Smith   /* backward solve the upper triangular */
167015091d37SBarry Smith   for (i=n-1; i>=0; i--){
167115091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
167215091d37SBarry Smith     vi   = aj + diag[i] + 1;
167315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
167415091d37SBarry Smith     idt  = 5*i;
1675f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1676f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
167715091d37SBarry Smith     while (nz--) {
167815091d37SBarry Smith       idx   = 5*(*vi++);
167915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1680f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1681f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1682f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1683f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1684f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
168515091d37SBarry Smith       v    += 25;
168615091d37SBarry Smith     }
168715091d37SBarry Smith     v        = aa + 25*diag[i];
1688f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1689f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1690f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1691f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1692f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
169315091d37SBarry Smith   }
169415091d37SBarry Smith 
1695d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1697d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
169815091d37SBarry Smith   PetscFunctionReturn(0);
169915091d37SBarry Smith }
170015091d37SBarry Smith 
17014a2ae208SSatish Balay #undef __FUNCT__
17024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1703dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
17044e2b4712SSatish Balay {
17054e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
17064e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17076849ba73SBarry Smith   PetscErrorCode    ierr;
1708690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1709690b6cddSBarry Smith   PetscInt          *diag = a->diag;
1710d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1711d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1712d9fead3dSBarry Smith   const PetscScalar *b;
17134e2b4712SSatish Balay 
17144e2b4712SSatish Balay   PetscFunctionBegin;
1715d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1717f1af5d2fSBarry Smith   t  = a->solve_work;
17184e2b4712SSatish Balay 
17194e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17204e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17214e2b4712SSatish Balay 
17224e2b4712SSatish Balay   /* forward solve the lower triangular */
17234e2b4712SSatish Balay   idx    = 4*(*r++);
1724f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1725f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
17264e2b4712SSatish Balay   for (i=1; i<n; i++) {
17274e2b4712SSatish Balay     v     = aa + 16*ai[i];
17284e2b4712SSatish Balay     vi    = aj + ai[i];
17294e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17304e2b4712SSatish Balay     idx   = 4*(*r++);
1731f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17324e2b4712SSatish Balay     while (nz--) {
17334e2b4712SSatish Balay       idx   = 4*(*vi++);
1734f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1735f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1736f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1737f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1738f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17394e2b4712SSatish Balay       v    += 16;
17404e2b4712SSatish Balay     }
17414e2b4712SSatish Balay     idx        = 4*i;
1742f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1743f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17444e2b4712SSatish Balay   }
17454e2b4712SSatish Balay   /* backward solve the upper triangular */
17464e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17474e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17484e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17494e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17504e2b4712SSatish Balay     idt  = 4*i;
1751f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1752f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17534e2b4712SSatish Balay     while (nz--) {
17544e2b4712SSatish Balay       idx   = 4*(*vi++);
1755f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1756f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1757f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1758f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1759f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1760f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17614e2b4712SSatish Balay       v += 16;
17624e2b4712SSatish Balay     }
17634e2b4712SSatish Balay     idc      = 4*(*c--);
17644e2b4712SSatish Balay     v        = aa + 16*diag[i];
1765f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1766f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1767f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1768f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17694e2b4712SSatish Balay   }
17704e2b4712SSatish Balay 
17714e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17724e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1773d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
17764e2b4712SSatish Balay   PetscFunctionReturn(0);
17774e2b4712SSatish Balay }
1778f26ec98cSKris Buschelman 
1779f26ec98cSKris Buschelman #undef __FUNCT__
1780f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1781dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1782f26ec98cSKris Buschelman {
1783f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1784f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
17856849ba73SBarry Smith   PetscErrorCode    ierr;
1786690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1787690b6cddSBarry Smith   PetscInt          *diag = a->diag;
1788d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1789d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790d9fead3dSBarry Smith   PetscScalar       *x;
1791d9fead3dSBarry Smith   const PetscScalar *b;
1792f26ec98cSKris Buschelman 
1793f26ec98cSKris Buschelman   PetscFunctionBegin;
1794d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17951ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1797f26ec98cSKris Buschelman 
1798f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1799f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1800f26ec98cSKris Buschelman 
1801f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1802f26ec98cSKris Buschelman   idx    = 4*(*r++);
1803f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1804f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1805f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1806f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1807f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1808f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1809f26ec98cSKris Buschelman     vi    = aj + ai[i];
1810f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1811f26ec98cSKris Buschelman     idx   = 4*(*r++);
1812f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1813f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1814f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1815f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1816f26ec98cSKris Buschelman     while (nz--) {
1817f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1818f26ec98cSKris Buschelman       x1  = t[idx];
1819f26ec98cSKris Buschelman       x2  = t[1+idx];
1820f26ec98cSKris Buschelman       x3  = t[2+idx];
1821f26ec98cSKris Buschelman       x4  = t[3+idx];
1822f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1823f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1824f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1825f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1826f26ec98cSKris Buschelman       v    += 16;
1827f26ec98cSKris Buschelman     }
1828f26ec98cSKris Buschelman     idx        = 4*i;
1829f26ec98cSKris Buschelman     t[idx]   = s1;
1830f26ec98cSKris Buschelman     t[1+idx] = s2;
1831f26ec98cSKris Buschelman     t[2+idx] = s3;
1832f26ec98cSKris Buschelman     t[3+idx] = s4;
1833f26ec98cSKris Buschelman   }
1834f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1835f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1836f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1837f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1838f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1839f26ec98cSKris Buschelman     idt  = 4*i;
1840f26ec98cSKris Buschelman     s1 = t[idt];
1841f26ec98cSKris Buschelman     s2 = t[1+idt];
1842f26ec98cSKris Buschelman     s3 = t[2+idt];
1843f26ec98cSKris Buschelman     s4 = t[3+idt];
1844f26ec98cSKris Buschelman     while (nz--) {
1845f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1846f26ec98cSKris Buschelman       x1  = t[idx];
1847f26ec98cSKris Buschelman       x2  = t[1+idx];
1848f26ec98cSKris Buschelman       x3  = t[2+idx];
1849f26ec98cSKris Buschelman       x4  = t[3+idx];
1850f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1851f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1852f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1853f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1854f26ec98cSKris Buschelman       v += 16;
1855f26ec98cSKris Buschelman     }
1856f26ec98cSKris Buschelman     idc      = 4*(*c--);
1857f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1858f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1859f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1860f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1861f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1862f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1863f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1864f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1865f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1866f26ec98cSKris Buschelman  }
1867f26ec98cSKris Buschelman 
1868f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1869f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1870d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1872d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1873f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1874f26ec98cSKris Buschelman }
1875f26ec98cSKris Buschelman 
187624c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
187724c233c2SKris Buschelman 
187824c233c2SKris Buschelman #include PETSC_HAVE_SSE
187924c233c2SKris Buschelman 
188024c233c2SKris Buschelman #undef __FUNCT__
188124c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1882dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
188324c233c2SKris Buschelman {
188424c233c2SKris Buschelman   /*
188524c233c2SKris Buschelman      Note: This code uses demotion of double
188624c233c2SKris Buschelman      to float when performing the mixed-mode computation.
188724c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
188824c233c2SKris Buschelman   */
188924c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
189024c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
18916849ba73SBarry Smith   PetscErrorCode ierr;
1892690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1893690b6cddSBarry Smith   PetscInt       *diag = a->diag,ai16;
189424c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
189587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
189624c233c2SKris Buschelman 
189724c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
189824c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
189924c233c2SKris Buschelman   unsigned long   offset;
190024c233c2SKris Buschelman 
190124c233c2SKris Buschelman   PetscFunctionBegin;
190224c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
190324c233c2SKris Buschelman 
190424c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
190524c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
190624c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
190724c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
190824c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
190924c233c2SKris Buschelman 
19101ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
19111ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
191224c233c2SKris Buschelman     t  = a->solve_work;
191324c233c2SKris Buschelman 
191424c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
191524c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
191624c233c2SKris Buschelman 
191724c233c2SKris Buschelman     /* forward solve the lower triangular */
191824c233c2SKris Buschelman     idx  = 4*(*r++);
191924c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
192024c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
192124c233c2SKris Buschelman     v    =  aa + 16*ai[1];
192224c233c2SKris Buschelman 
192324c233c2SKris Buschelman     for (i=1; i<n;) {
192424c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
192524c233c2SKris Buschelman       vi   =  aj      + ai[i];
192624c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
192724c233c2SKris Buschelman       idx  =  4*(*r++);
192824c233c2SKris Buschelman 
192924c233c2SKris Buschelman       /* Demote sum from double to float */
193024c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
193124c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
193224c233c2SKris Buschelman 
193324c233c2SKris Buschelman       while (nz--) {
193424c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
193524c233c2SKris Buschelman         idx = 4*(*vi++);
193624c233c2SKris Buschelman 
193724c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
193824c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
193924c233c2SKris Buschelman 
194024c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
194124c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
194224c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
194324c233c2SKris Buschelman 
194424c233c2SKris Buschelman           /* First Column */
194524c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
194624c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
194724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
194824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
194924c233c2SKris Buschelman 
195024c233c2SKris Buschelman           /* Second Column */
195124c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
195224c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
195324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
195424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
195524c233c2SKris Buschelman 
195624c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
195724c233c2SKris Buschelman 
195824c233c2SKris Buschelman           /* Third Column */
195924c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
196024c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
196124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
196224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
196324c233c2SKris Buschelman 
196424c233c2SKris Buschelman           /* Fourth Column */
196524c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
196624c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
196724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
196824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
196924c233c2SKris Buschelman         SSE_INLINE_END_2
197024c233c2SKris Buschelman 
197124c233c2SKris Buschelman         v  += 16;
197224c233c2SKris Buschelman       }
197324c233c2SKris Buschelman       idx = 4*i;
197424c233c2SKris Buschelman       v   = aa + 16*ai[++i];
197524c233c2SKris Buschelman       PREFETCH_NTA(v);
197624c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
197724c233c2SKris Buschelman 
197824c233c2SKris Buschelman       /* Promote result from float to double */
197924c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
198024c233c2SKris Buschelman     }
198124c233c2SKris Buschelman     /* backward solve the upper triangular */
198224c233c2SKris Buschelman     idt  = 4*(n-1);
198324c233c2SKris Buschelman     ai16 = 16*diag[n-1];
198424c233c2SKris Buschelman     v    = aa + ai16 + 16;
198524c233c2SKris Buschelman     for (i=n-1; i>=0;){
198624c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
198724c233c2SKris Buschelman       vi = aj + diag[i] + 1;
198824c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
198924c233c2SKris Buschelman 
199024c233c2SKris Buschelman       /* Demote accumulator from double to float */
199124c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
199224c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
199324c233c2SKris Buschelman 
199424c233c2SKris Buschelman       while (nz--) {
199524c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
199624c233c2SKris Buschelman         idx = 4*(*vi++);
199724c233c2SKris Buschelman 
199824c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
199924c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
200024c233c2SKris Buschelman 
200124c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
200224c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
200324c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
200424c233c2SKris Buschelman 
200524c233c2SKris Buschelman           /* First Column */
200624c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
200724c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
200824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
200924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
201024c233c2SKris Buschelman 
201124c233c2SKris Buschelman           /* Second Column */
201224c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
201324c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
201424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
201524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
201624c233c2SKris Buschelman 
201724c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
201824c233c2SKris Buschelman 
201924c233c2SKris Buschelman           /* Third Column */
202024c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
202124c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
202224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
202324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
202424c233c2SKris Buschelman 
202524c233c2SKris Buschelman           /* Fourth Column */
202624c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
202724c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
202824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
202924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
203024c233c2SKris Buschelman         SSE_INLINE_END_2
203124c233c2SKris Buschelman         v  += 16;
203224c233c2SKris Buschelman       }
203324c233c2SKris Buschelman       v    = aa + ai16;
203424c233c2SKris Buschelman       ai16 = 16*diag[--i];
203524c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
203624c233c2SKris Buschelman       /*
203724c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
203824c233c2SKris Buschelman          which was inverted as part of the factorization
203924c233c2SKris Buschelman       */
204024c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
204124c233c2SKris Buschelman         /* First Column */
204224c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
204324c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
204424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
204524c233c2SKris Buschelman 
204624c233c2SKris Buschelman         /* Second Column */
204724c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
204824c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
204924c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
205024c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
205124c233c2SKris Buschelman 
205224c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
205324c233c2SKris Buschelman 
205424c233c2SKris Buschelman         /* Third Column */
205524c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
205624c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
205724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
205824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
205924c233c2SKris Buschelman 
206024c233c2SKris Buschelman         /* Fourth Column */
206124c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
206224c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
206324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
206424c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
206524c233c2SKris Buschelman 
206624c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
206724c233c2SKris Buschelman       SSE_INLINE_END_3
206824c233c2SKris Buschelman 
206924c233c2SKris Buschelman       /* Promote solution from float to double */
207024c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
207124c233c2SKris Buschelman 
207224c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
207324c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
207424c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
207524c233c2SKris Buschelman       idc  = 4*(*c--);
207624c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
207724c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
207824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
207924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
208024c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
208124c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
208224c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
208324c233c2SKris Buschelman       SSE_INLINE_END_2
208424c233c2SKris Buschelman       v    = aa + ai16 + 16;
208524c233c2SKris Buschelman       idt -= 4;
208624c233c2SKris Buschelman     }
208724c233c2SKris Buschelman 
208824c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
208924c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20901ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
20911ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2092d0f46423SBarry Smith     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
209324c233c2SKris Buschelman   SSE_SCOPE_END;
209424c233c2SKris Buschelman   PetscFunctionReturn(0);
209524c233c2SKris Buschelman }
209624c233c2SKris Buschelman 
209724c233c2SKris Buschelman #endif
20980ef38995SBarry Smith 
20990ef38995SBarry Smith 
21004e2b4712SSatish Balay /*
21014e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
21024e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
21034e2b4712SSatish Balay */
21044a2ae208SSatish Balay #undef __FUNCT__
21054a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2106dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
21074e2b4712SSatish Balay {
21084e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2109356650c2SBarry Smith   PetscInt          n=a->mbs;
2110356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2111dfbe8321SBarry Smith   PetscErrorCode    ierr;
2112356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2113d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2114d9fead3dSBarry Smith   PetscScalar       *x;
2115d9fead3dSBarry Smith   const PetscScalar *b;
21164e2b4712SSatish Balay 
21174e2b4712SSatish Balay   PetscFunctionBegin;
2118d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21204e2b4712SSatish Balay 
2121aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
21222853dc0eSBarry Smith   {
212387828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21242853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
21252853dc0eSBarry Smith   }
2126aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
21272853dc0eSBarry Smith   {
212887828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21292853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
21302853dc0eSBarry Smith   }
2131aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
21322853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2133e1293385SBarry Smith #else
213430d4dcafSBarry Smith   {
213587828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2136d9fead3dSBarry Smith     const MatScalar *v;
2137356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2138356650c2SBarry Smith     const PetscInt  *vi;
2139e1293385SBarry Smith 
21404e2b4712SSatish Balay   /* forward solve the lower triangular */
21414e2b4712SSatish Balay   idx    = 0;
2142e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21434e2b4712SSatish Balay   for (i=1; i<n; i++) {
21444e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21454e2b4712SSatish Balay     vi    =  aj      + ai[i];
21464e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2147e1293385SBarry Smith     idx   +=  4;
2148f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21494e2b4712SSatish Balay     while (nz--) {
21504e2b4712SSatish Balay       jdx   = 4*(*vi++);
21514e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2152f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2153f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2154f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2155f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21564e2b4712SSatish Balay       v    += 16;
21574e2b4712SSatish Balay     }
2158f1af5d2fSBarry Smith     x[idx]   = s1;
2159f1af5d2fSBarry Smith     x[1+idx] = s2;
2160f1af5d2fSBarry Smith     x[2+idx] = s3;
2161f1af5d2fSBarry Smith     x[3+idx] = s4;
21624e2b4712SSatish Balay   }
21634e2b4712SSatish Balay   /* backward solve the upper triangular */
21644e555682SBarry Smith   idt = 4*(n-1);
21654e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21664e555682SBarry Smith     ai16 = 16*diag[i];
21674e555682SBarry Smith     v    = aa + ai16 + 16;
21684e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21694e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2170f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2171f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21724e2b4712SSatish Balay     while (nz--) {
21734e2b4712SSatish Balay       idx   = 4*(*vi++);
21744e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2175f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2176f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2177f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2178f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21794e2b4712SSatish Balay       v    += 16;
21804e2b4712SSatish Balay     }
21814e555682SBarry Smith     v        = aa + ai16;
2182f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2183f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2184f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2185f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2186329f5518SBarry Smith     idt -= 4;
21874e2b4712SSatish Balay   }
218830d4dcafSBarry Smith   }
2189e1293385SBarry Smith #endif
21904e2b4712SSatish Balay 
2191d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2193d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
21944e2b4712SSatish Balay   PetscFunctionReturn(0);
21954e2b4712SSatish Balay }
21964e2b4712SSatish Balay 
2197f26ec98cSKris Buschelman #undef __FUNCT__
2198f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2199dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2200f26ec98cSKris Buschelman {
2201f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2202690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2203dfbe8321SBarry Smith   PetscErrorCode ierr;
2204690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2205f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2206f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2207f26ec98cSKris Buschelman 
2208f26ec98cSKris Buschelman   PetscFunctionBegin;
22091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2211f26ec98cSKris Buschelman 
2212f26ec98cSKris Buschelman   {
2213f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2214f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2215690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2216f26ec98cSKris Buschelman 
2217f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2218f26ec98cSKris Buschelman     idx  = 0;
2219f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2220f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2221f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2222f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2223f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2224f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2225f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2226f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2227f26ec98cSKris Buschelman       idx   +=  4;
2228f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2229f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2230f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2231f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2232f26ec98cSKris Buschelman       while (nz--) {
2233f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2234f26ec98cSKris Buschelman         x1  = t[jdx];
2235f26ec98cSKris Buschelman         x2  = t[1+jdx];
2236f26ec98cSKris Buschelman         x3  = t[2+jdx];
2237f26ec98cSKris Buschelman         x4  = t[3+jdx];
2238f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2239f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2240f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2241f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2242f26ec98cSKris Buschelman         v    += 16;
2243f26ec98cSKris Buschelman       }
2244f26ec98cSKris Buschelman       t[idx]   = s1;
2245f26ec98cSKris Buschelman       t[1+idx] = s2;
2246f26ec98cSKris Buschelman       t[2+idx] = s3;
2247f26ec98cSKris Buschelman       t[3+idx] = s4;
2248f26ec98cSKris Buschelman     }
2249f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2250f26ec98cSKris Buschelman     idt = 4*(n-1);
2251f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2252f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2253f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2254f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2255f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2256f26ec98cSKris Buschelman       s1   = t[idt];
2257f26ec98cSKris Buschelman       s2   = t[1+idt];
2258f26ec98cSKris Buschelman       s3   = t[2+idt];
2259f26ec98cSKris Buschelman       s4   = t[3+idt];
2260f26ec98cSKris Buschelman       while (nz--) {
2261f26ec98cSKris Buschelman         idx = 4*(*vi++);
2262f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2263f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2264f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2265f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2266f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2267f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2268f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2269f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2270f26ec98cSKris Buschelman         v    += 16;
2271f26ec98cSKris Buschelman       }
2272f26ec98cSKris Buschelman       v        = aa + ai16;
2273f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2274f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2275f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2276f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2277f26ec98cSKris Buschelman       idt -= 4;
2278f26ec98cSKris Buschelman     }
2279f26ec98cSKris Buschelman   }
2280f26ec98cSKris Buschelman 
22811ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22821ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2283d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2284f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2285f26ec98cSKris Buschelman }
2286f26ec98cSKris Buschelman 
22873660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22883660e330SKris Buschelman 
22893660e330SKris Buschelman #include PETSC_HAVE_SSE
22903660e330SKris Buschelman #undef __FUNCT__
22917cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2292dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
22933660e330SKris Buschelman {
22943660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
22952aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2296dfbe8321SBarry Smith   PetscErrorCode ierr;
2297dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
22983660e330SKris Buschelman   MatScalar      *aa=a->a;
229987828ca2SBarry Smith   PetscScalar    *x,*b;
23003660e330SKris Buschelman 
23013660e330SKris Buschelman   PetscFunctionBegin;
23023660e330SKris Buschelman   SSE_SCOPE_BEGIN;
23033660e330SKris Buschelman   /*
23043660e330SKris Buschelman      Note: This code currently uses demotion of double
23053660e330SKris Buschelman      to float when performing the mixed-mode computation.
23063660e330SKris Buschelman      This may not be numerically reasonable for all applications.
23073660e330SKris Buschelman   */
23083660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
23093660e330SKris Buschelman 
23101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23123660e330SKris Buschelman   {
2313eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2314eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
23152aa5897fSKris Buschelman     int            nz,i,idt,ai16;
23162aa5897fSKris Buschelman     unsigned int   jdx,idx;
23172aa5897fSKris Buschelman     unsigned short *vi;
2318eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
23193660e330SKris Buschelman 
2320eb05f457SKris Buschelman     /* First block is the identity. */
23213660e330SKris Buschelman     idx  = 0;
2322eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
23232aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
23243660e330SKris Buschelman 
23253660e330SKris Buschelman     for (i=1; i<n;) {
23263660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23273660e330SKris Buschelman       vi   =  aj      + ai[i];
23283660e330SKris Buschelman       nz   =  diag[i] - ai[i];
23293660e330SKris Buschelman       idx +=  4;
23303660e330SKris Buschelman 
2331eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2332eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2333eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
23343660e330SKris Buschelman 
23353660e330SKris Buschelman       while (nz--) {
23363660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23372aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
23383660e330SKris Buschelman 
23393660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2340eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
23413660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23423660e330SKris Buschelman 
23433660e330SKris Buschelman           /* First Column */
23443660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23453660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23463660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23473660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23483660e330SKris Buschelman 
23493660e330SKris Buschelman           /* Second Column */
23503660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23513660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23523660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23533660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23543660e330SKris Buschelman 
23553660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23563660e330SKris Buschelman 
23573660e330SKris Buschelman           /* Third Column */
23583660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23593660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23603660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23613660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23623660e330SKris Buschelman 
23633660e330SKris Buschelman           /* Fourth Column */
23643660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23653660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23663660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23673660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23683660e330SKris Buschelman         SSE_INLINE_END_2
23693660e330SKris Buschelman 
23703660e330SKris Buschelman         v  += 16;
23713660e330SKris Buschelman       }
23723660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23733660e330SKris Buschelman       PREFETCH_NTA(v);
2374eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
23753660e330SKris Buschelman     }
2376eb05f457SKris Buschelman 
2377eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2378eb05f457SKris Buschelman 
23793660e330SKris Buschelman     idt  = 4*(n-1);
23803660e330SKris Buschelman     ai16 = 16*diag[n-1];
23813660e330SKris Buschelman     v    = aa + ai16 + 16;
23823660e330SKris Buschelman     for (i=n-1; i>=0;){
23833660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23843660e330SKris Buschelman       vi = aj + diag[i] + 1;
23853660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23863660e330SKris Buschelman 
2387eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
23883660e330SKris Buschelman 
23893660e330SKris Buschelman       while (nz--) {
23903660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23912aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
23923660e330SKris Buschelman 
23933660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2394eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
23953660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23963660e330SKris Buschelman 
23973660e330SKris Buschelman           /* First Column */
23983660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23993660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24003660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24013660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24023660e330SKris Buschelman 
24033660e330SKris Buschelman           /* Second Column */
24043660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24053660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24063660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24073660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24083660e330SKris Buschelman 
24093660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24103660e330SKris Buschelman 
24113660e330SKris Buschelman           /* Third Column */
24123660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24133660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24143660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24163660e330SKris Buschelman 
24173660e330SKris Buschelman           /* Fourth Column */
24183660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24193660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24203660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24213660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
24223660e330SKris Buschelman         SSE_INLINE_END_2
24233660e330SKris Buschelman         v  += 16;
24243660e330SKris Buschelman       }
24253660e330SKris Buschelman       v    = aa + ai16;
24263660e330SKris Buschelman       ai16 = 16*diag[--i];
24273660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
24283660e330SKris Buschelman       /*
24293660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
24303660e330SKris Buschelman          which was inverted as part of the factorization
24313660e330SKris Buschelman       */
2432eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
24333660e330SKris Buschelman         /* First Column */
24343660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
24353660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
24363660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
24373660e330SKris Buschelman 
24383660e330SKris Buschelman         /* Second Column */
24393660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24403660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24413660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24423660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24433660e330SKris Buschelman 
24443660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24453660e330SKris Buschelman 
24463660e330SKris Buschelman         /* Third Column */
24473660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24483660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24493660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24503660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24513660e330SKris Buschelman 
24523660e330SKris Buschelman         /* Fourth Column */
24533660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24543660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24553660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24563660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24573660e330SKris Buschelman 
24583660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24593660e330SKris Buschelman       SSE_INLINE_END_3
24603660e330SKris Buschelman 
24613660e330SKris Buschelman       v    = aa + ai16 + 16;
24623660e330SKris Buschelman       idt -= 4;
24633660e330SKris Buschelman     }
2464eb05f457SKris Buschelman 
2465eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2466eb05f457SKris Buschelman     idt = 4*(n-1);
2467eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2468eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2469eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2470eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2471eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2472eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2473eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2474eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2475eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
247654693613SKris Buschelman       idt -= 4;
24773660e330SKris Buschelman     }
2478eb05f457SKris Buschelman 
2479eb05f457SKris Buschelman   } /* End of artificial scope. */
24801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
24811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2482d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
24833660e330SKris Buschelman   SSE_SCOPE_END;
24843660e330SKris Buschelman   PetscFunctionReturn(0);
24853660e330SKris Buschelman }
24863660e330SKris Buschelman 
24877cf1b8d3SKris Buschelman #undef __FUNCT__
24887cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2489dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
24907cf1b8d3SKris Buschelman {
24917cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
24927cf1b8d3SKris Buschelman   int            *aj=a->j;
2493dfbe8321SBarry Smith   PetscErrorCode ierr;
2494dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
24957cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
24967cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
24977cf1b8d3SKris Buschelman 
24987cf1b8d3SKris Buschelman   PetscFunctionBegin;
24997cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
25007cf1b8d3SKris Buschelman   /*
25017cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
25027cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
25037cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
25047cf1b8d3SKris Buschelman   */
25057cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
25067cf1b8d3SKris Buschelman 
25071ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25097cf1b8d3SKris Buschelman   {
25107cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
25117cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
25127cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
25137cf1b8d3SKris Buschelman     int       jdx,idx;
25147cf1b8d3SKris Buschelman     int       *vi;
25157cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
25167cf1b8d3SKris Buschelman 
25177cf1b8d3SKris Buschelman     /* First block is the identity. */
25187cf1b8d3SKris Buschelman     idx  = 0;
25197cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
25207cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
25217cf1b8d3SKris Buschelman 
25227cf1b8d3SKris Buschelman     for (i=1; i<n;) {
25237cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25247cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
25257cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
25267cf1b8d3SKris Buschelman       idx +=  4;
25277cf1b8d3SKris Buschelman 
25287cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
25297cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
25307cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
25317cf1b8d3SKris Buschelman 
25327cf1b8d3SKris Buschelman       while (nz--) {
25337cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25347cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
25357cf1b8d3SKris Buschelman /*          jdx = *vi++; */
25367cf1b8d3SKris Buschelman 
25377cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
25387cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
25397cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25407cf1b8d3SKris Buschelman 
25417cf1b8d3SKris Buschelman           /* First Column */
25427cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25437cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25447cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25457cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25467cf1b8d3SKris Buschelman 
25477cf1b8d3SKris Buschelman           /* Second Column */
25487cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25497cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25507cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25517cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25527cf1b8d3SKris Buschelman 
25537cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25547cf1b8d3SKris Buschelman 
25557cf1b8d3SKris Buschelman           /* Third Column */
25567cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25577cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25587cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25597cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25607cf1b8d3SKris Buschelman 
25617cf1b8d3SKris Buschelman           /* Fourth Column */
25627cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25637cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25647cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25657cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25667cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25677cf1b8d3SKris Buschelman 
25687cf1b8d3SKris Buschelman         v  += 16;
25697cf1b8d3SKris Buschelman       }
25707cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
25717cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
25727cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
25737cf1b8d3SKris Buschelman     }
25747cf1b8d3SKris Buschelman 
25757cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
25767cf1b8d3SKris Buschelman 
25777cf1b8d3SKris Buschelman     idt  = 4*(n-1);
25787cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
25797cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
25807cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
25817cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25827cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
25837cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25847cf1b8d3SKris Buschelman 
25857cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25867cf1b8d3SKris Buschelman 
25877cf1b8d3SKris Buschelman       while (nz--) {
25887cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25897cf1b8d3SKris Buschelman         idx = 4*(*vi++);
25907cf1b8d3SKris Buschelman /*          idx = *vi++; */
25917cf1b8d3SKris Buschelman 
25927cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
25937cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
25947cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25957cf1b8d3SKris Buschelman 
25967cf1b8d3SKris Buschelman           /* First Column */
25977cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25987cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25997cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26007cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26017cf1b8d3SKris Buschelman 
26027cf1b8d3SKris Buschelman           /* Second Column */
26037cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26047cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26057cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26067cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26077cf1b8d3SKris Buschelman 
26087cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26097cf1b8d3SKris Buschelman 
26107cf1b8d3SKris Buschelman           /* Third Column */
26117cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26127cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26137cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26147cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26157cf1b8d3SKris Buschelman 
26167cf1b8d3SKris Buschelman           /* Fourth Column */
26177cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26187cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26197cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26207cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26217cf1b8d3SKris Buschelman         SSE_INLINE_END_2
26227cf1b8d3SKris Buschelman         v  += 16;
26237cf1b8d3SKris Buschelman       }
26247cf1b8d3SKris Buschelman       v    = aa + ai16;
26257cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
26267cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
26277cf1b8d3SKris Buschelman       /*
26287cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
26297cf1b8d3SKris Buschelman          which was inverted as part of the factorization
26307cf1b8d3SKris Buschelman       */
26317cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
26327cf1b8d3SKris Buschelman         /* First Column */
26337cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
26347cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
26357cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
26367cf1b8d3SKris Buschelman 
26377cf1b8d3SKris Buschelman         /* Second Column */
26387cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
26397cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
26407cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
26417cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
26427cf1b8d3SKris Buschelman 
26437cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
26447cf1b8d3SKris Buschelman 
26457cf1b8d3SKris Buschelman         /* Third Column */
26467cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
26477cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
26487cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
26497cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
26507cf1b8d3SKris Buschelman 
26517cf1b8d3SKris Buschelman         /* Fourth Column */
26527cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
26537cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
26547cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
26557cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
26567cf1b8d3SKris Buschelman 
26577cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
26587cf1b8d3SKris Buschelman       SSE_INLINE_END_3
26597cf1b8d3SKris Buschelman 
26607cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
26617cf1b8d3SKris Buschelman       idt -= 4;
26627cf1b8d3SKris Buschelman     }
26637cf1b8d3SKris Buschelman 
26647cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
26657cf1b8d3SKris Buschelman     idt = 4*(n-1);
26667cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
26677cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
26687cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
26697cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
26707cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
26717cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
26727cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
26737cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
26747cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
26757cf1b8d3SKris Buschelman       idt -= 4;
26767cf1b8d3SKris Buschelman     }
26777cf1b8d3SKris Buschelman 
26787cf1b8d3SKris Buschelman   } /* End of artificial scope. */
26791ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26801ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2681d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
26827cf1b8d3SKris Buschelman   SSE_SCOPE_END;
26837cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
26847cf1b8d3SKris Buschelman }
26857cf1b8d3SKris Buschelman 
26863660e330SKris Buschelman #endif
26874a2ae208SSatish Balay #undef __FUNCT__
26884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2689dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
26904e2b4712SSatish Balay {
26914e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
26924e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
26936849ba73SBarry Smith   PetscErrorCode    ierr;
2694690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2695690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2696d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2697d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2698d9fead3dSBarry Smith   const PetscScalar *b;
26994e2b4712SSatish Balay 
27004e2b4712SSatish Balay   PetscFunctionBegin;
2701d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2703f1af5d2fSBarry Smith   t  = a->solve_work;
27044e2b4712SSatish Balay 
27054e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27064e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27074e2b4712SSatish Balay 
27084e2b4712SSatish Balay   /* forward solve the lower triangular */
27094e2b4712SSatish Balay   idx    = 3*(*r++);
2710f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
27114e2b4712SSatish Balay   for (i=1; i<n; i++) {
27124e2b4712SSatish Balay     v     = aa + 9*ai[i];
27134e2b4712SSatish Balay     vi    = aj + ai[i];
27144e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27154e2b4712SSatish Balay     idx   = 3*(*r++);
2716f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
27174e2b4712SSatish Balay     while (nz--) {
27184e2b4712SSatish Balay       idx   = 3*(*vi++);
2719f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2720f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2721f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2722f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27234e2b4712SSatish Balay       v += 9;
27244e2b4712SSatish Balay     }
27254e2b4712SSatish Balay     idx = 3*i;
2726f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
27274e2b4712SSatish Balay   }
27284e2b4712SSatish Balay   /* backward solve the upper triangular */
27294e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27304e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
27314e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27324e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
27334e2b4712SSatish Balay     idt  = 3*i;
2734f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
27354e2b4712SSatish Balay     while (nz--) {
27364e2b4712SSatish Balay       idx   = 3*(*vi++);
2737f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2738f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2739f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2740f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27414e2b4712SSatish Balay       v += 9;
27424e2b4712SSatish Balay     }
27434e2b4712SSatish Balay     idc = 3*(*c--);
27444e2b4712SSatish Balay     v   = aa + 9*diag[i];
2745f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2746f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2747f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
27484e2b4712SSatish Balay   }
27494e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27504e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2751d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2753d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
27544e2b4712SSatish Balay   PetscFunctionReturn(0);
27554e2b4712SSatish Balay }
27564e2b4712SSatish Balay 
275715091d37SBarry Smith /*
275815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
275915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
276015091d37SBarry Smith */
27614a2ae208SSatish Balay #undef __FUNCT__
27624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2763dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
276415091d37SBarry Smith {
276515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2766690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2767dfbe8321SBarry Smith   PetscErrorCode    ierr;
2768690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2769d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2770d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2771d9fead3dSBarry Smith   const PetscScalar *b;
2772690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
277315091d37SBarry Smith 
277415091d37SBarry Smith   PetscFunctionBegin;
2775d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
277715091d37SBarry Smith 
277815091d37SBarry Smith   /* forward solve the lower triangular */
277915091d37SBarry Smith   idx    = 0;
278015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
278115091d37SBarry Smith   for (i=1; i<n; i++) {
278215091d37SBarry Smith     v     =  aa      + 9*ai[i];
278315091d37SBarry Smith     vi    =  aj      + ai[i];
278415091d37SBarry Smith     nz    =  diag[i] - ai[i];
278515091d37SBarry Smith     idx   +=  3;
2786f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
278715091d37SBarry Smith     while (nz--) {
278815091d37SBarry Smith       jdx   = 3*(*vi++);
278915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2790f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2791f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2792f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
279315091d37SBarry Smith       v    += 9;
279415091d37SBarry Smith     }
2795f1af5d2fSBarry Smith     x[idx]   = s1;
2796f1af5d2fSBarry Smith     x[1+idx] = s2;
2797f1af5d2fSBarry Smith     x[2+idx] = s3;
279815091d37SBarry Smith   }
279915091d37SBarry Smith   /* backward solve the upper triangular */
280015091d37SBarry Smith   for (i=n-1; i>=0; i--){
280115091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
280215091d37SBarry Smith     vi   = aj + diag[i] + 1;
280315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
280415091d37SBarry Smith     idt  = 3*i;
2805f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2806f1af5d2fSBarry Smith     s3 = x[2+idt];
280715091d37SBarry Smith     while (nz--) {
280815091d37SBarry Smith       idx   = 3*(*vi++);
280915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2810f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2811f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2812f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
281315091d37SBarry Smith       v    += 9;
281415091d37SBarry Smith     }
281515091d37SBarry Smith     v        = aa +  9*diag[i];
2816f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2817f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2818f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
281915091d37SBarry Smith   }
282015091d37SBarry Smith 
2821d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2823d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
282415091d37SBarry Smith   PetscFunctionReturn(0);
282515091d37SBarry Smith }
282615091d37SBarry Smith 
28274a2ae208SSatish Balay #undef __FUNCT__
28284a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2829dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
28304e2b4712SSatish Balay {
28314e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
28324e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
28336849ba73SBarry Smith   PetscErrorCode    ierr;
2834690b6cddSBarry Smith   PetscInt          *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
2835690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2836d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2837d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
2838d9fead3dSBarry Smith   const PetscScalar *b;
28394e2b4712SSatish Balay 
28404e2b4712SSatish Balay   PetscFunctionBegin;
2841d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2843f1af5d2fSBarry Smith   t  = a->solve_work;
28444e2b4712SSatish Balay 
28454e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28464e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28474e2b4712SSatish Balay 
28484e2b4712SSatish Balay   /* forward solve the lower triangular */
28494e2b4712SSatish Balay   idx    = 2*(*r++);
2850f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
28514e2b4712SSatish Balay   for (i=1; i<n; i++) {
28524e2b4712SSatish Balay     v     = aa + 4*ai[i];
28534e2b4712SSatish Balay     vi    = aj + ai[i];
28544e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28554e2b4712SSatish Balay     idx   = 2*(*r++);
2856f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
28574e2b4712SSatish Balay     while (nz--) {
28584e2b4712SSatish Balay       idx   = 2*(*vi++);
2859f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2860f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2861f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28624e2b4712SSatish Balay       v += 4;
28634e2b4712SSatish Balay     }
28644e2b4712SSatish Balay     idx = 2*i;
2865f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
28664e2b4712SSatish Balay   }
28674e2b4712SSatish Balay   /* backward solve the upper triangular */
28684e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28694e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
28704e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28714e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28724e2b4712SSatish Balay     idt  = 2*i;
2873f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
28744e2b4712SSatish Balay     while (nz--) {
28754e2b4712SSatish Balay       idx   = 2*(*vi++);
2876f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2877f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2878f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28794e2b4712SSatish Balay       v += 4;
28804e2b4712SSatish Balay     }
28814e2b4712SSatish Balay     idc = 2*(*c--);
28824e2b4712SSatish Balay     v   = aa + 4*diag[i];
2883f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2884f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
28854e2b4712SSatish Balay   }
28864e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28874e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2888d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2890d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
28914e2b4712SSatish Balay   PetscFunctionReturn(0);
28924e2b4712SSatish Balay }
28934e2b4712SSatish Balay 
289415091d37SBarry Smith /*
289515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
289615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
289715091d37SBarry Smith */
28984a2ae208SSatish Balay #undef __FUNCT__
28994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2900dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
290115091d37SBarry Smith {
290215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2904dfbe8321SBarry Smith   PetscErrorCode    ierr;
2905690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2906d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2907d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
2908d9fead3dSBarry Smith   const PetscScalar *b;
2909690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
291015091d37SBarry Smith 
291115091d37SBarry Smith   PetscFunctionBegin;
2912d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291415091d37SBarry Smith 
291515091d37SBarry Smith   /* forward solve the lower triangular */
291615091d37SBarry Smith   idx    = 0;
291715091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
291815091d37SBarry Smith   for (i=1; i<n; i++) {
291915091d37SBarry Smith     v     =  aa      + 4*ai[i];
292015091d37SBarry Smith     vi    =  aj      + ai[i];
292115091d37SBarry Smith     nz    =  diag[i] - ai[i];
292215091d37SBarry Smith     idx   +=  2;
2923f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
292415091d37SBarry Smith     while (nz--) {
292515091d37SBarry Smith       jdx   = 2*(*vi++);
292615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2927f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2928f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
292915091d37SBarry Smith       v    += 4;
293015091d37SBarry Smith     }
2931f1af5d2fSBarry Smith     x[idx]   = s1;
2932f1af5d2fSBarry Smith     x[1+idx] = s2;
293315091d37SBarry Smith   }
293415091d37SBarry Smith   /* backward solve the upper triangular */
293515091d37SBarry Smith   for (i=n-1; i>=0; i--){
293615091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
293715091d37SBarry Smith     vi   = aj + diag[i] + 1;
293815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
293915091d37SBarry Smith     idt  = 2*i;
2940f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
294115091d37SBarry Smith     while (nz--) {
294215091d37SBarry Smith       idx   = 2*(*vi++);
294315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2944f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2945f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
294615091d37SBarry Smith       v    += 4;
294715091d37SBarry Smith     }
294815091d37SBarry Smith     v        = aa +  4*diag[i];
2949f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2950f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
295115091d37SBarry Smith   }
295215091d37SBarry Smith 
2953d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2955d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
295615091d37SBarry Smith   PetscFunctionReturn(0);
295715091d37SBarry Smith }
295815091d37SBarry Smith 
29594a2ae208SSatish Balay #undef __FUNCT__
29604a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
29624e2b4712SSatish Balay {
29634e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
29644e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
29656849ba73SBarry Smith   PetscErrorCode ierr;
2966690b6cddSBarry Smith   PetscInt       *r,*c,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
2967690b6cddSBarry Smith   PetscInt       *diag = a->diag;
29683f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
296987828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
29704e2b4712SSatish Balay 
29714e2b4712SSatish Balay   PetscFunctionBegin;
29724e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
29734e2b4712SSatish Balay 
29741ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
29751ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2976f1af5d2fSBarry Smith   t  = a->solve_work;
29774e2b4712SSatish Balay 
29784e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29794e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29804e2b4712SSatish Balay 
29814e2b4712SSatish Balay   /* forward solve the lower triangular */
2982f1af5d2fSBarry Smith   t[0] = b[*r++];
29834e2b4712SSatish Balay   for (i=1; i<n; i++) {
29844e2b4712SSatish Balay     v     = aa + ai[i];
29854e2b4712SSatish Balay     vi    = aj + ai[i];
29864e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2987f1af5d2fSBarry Smith     s1  = b[*r++];
29884e2b4712SSatish Balay     while (nz--) {
2989f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29904e2b4712SSatish Balay     }
2991f1af5d2fSBarry Smith     t[i] = s1;
29924e2b4712SSatish Balay   }
29934e2b4712SSatish Balay   /* backward solve the upper triangular */
29944e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29954e2b4712SSatish Balay     v    = aa + diag[i] + 1;
29964e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29974e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2998f1af5d2fSBarry Smith     s1 = t[i];
29994e2b4712SSatish Balay     while (nz--) {
3000f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30014e2b4712SSatish Balay     }
3002f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
30034e2b4712SSatish Balay   }
30044e2b4712SSatish Balay 
30054e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30064e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30071ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3009d0f46423SBarry Smith   ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
30104e2b4712SSatish Balay   PetscFunctionReturn(0);
30114e2b4712SSatish Balay }
301215091d37SBarry Smith /*
301315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
301415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
301515091d37SBarry Smith */
30164a2ae208SSatish Balay #undef __FUNCT__
30174a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3018dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
301915091d37SBarry Smith {
302015091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3021690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3022dfbe8321SBarry Smith   PetscErrorCode ierr;
3023690b6cddSBarry Smith   PetscInt       *diag = a->diag;
302415091d37SBarry Smith   MatScalar      *aa=a->a;
302587828ca2SBarry Smith   PetscScalar    *x,*b;
302687828ca2SBarry Smith   PetscScalar    s1,x1;
302715091d37SBarry Smith   MatScalar      *v;
3028690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
302915091d37SBarry Smith 
303015091d37SBarry Smith   PetscFunctionBegin;
30311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
303315091d37SBarry Smith 
303415091d37SBarry Smith   /* forward solve the lower triangular */
303515091d37SBarry Smith   idx    = 0;
303615091d37SBarry Smith   x[0]   = b[0];
303715091d37SBarry Smith   for (i=1; i<n; i++) {
303815091d37SBarry Smith     v     =  aa      + ai[i];
303915091d37SBarry Smith     vi    =  aj      + ai[i];
304015091d37SBarry Smith     nz    =  diag[i] - ai[i];
304115091d37SBarry Smith     idx   +=  1;
3042f1af5d2fSBarry Smith     s1  =  b[idx];
304315091d37SBarry Smith     while (nz--) {
304415091d37SBarry Smith       jdx   = *vi++;
304515091d37SBarry Smith       x1    = x[jdx];
3046f1af5d2fSBarry Smith       s1 -= v[0]*x1;
304715091d37SBarry Smith       v    += 1;
304815091d37SBarry Smith     }
3049f1af5d2fSBarry Smith     x[idx]   = s1;
305015091d37SBarry Smith   }
305115091d37SBarry Smith   /* backward solve the upper triangular */
305215091d37SBarry Smith   for (i=n-1; i>=0; i--){
305315091d37SBarry Smith     v    = aa + diag[i] + 1;
305415091d37SBarry Smith     vi   = aj + diag[i] + 1;
305515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
305615091d37SBarry Smith     idt  = i;
3057f1af5d2fSBarry Smith     s1 = x[idt];
305815091d37SBarry Smith     while (nz--) {
305915091d37SBarry Smith       idx   = *vi++;
306015091d37SBarry Smith       x1    = x[idx];
3061f1af5d2fSBarry Smith       s1 -= v[0]*x1;
306215091d37SBarry Smith       v    += 1;
306315091d37SBarry Smith     }
306415091d37SBarry Smith     v        = aa +  diag[i];
3065f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
306615091d37SBarry Smith   }
30671ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3069d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
307015091d37SBarry Smith   PetscFunctionReturn(0);
307115091d37SBarry Smith }
30724e2b4712SSatish Balay 
30734e2b4712SSatish Balay /* ----------------------------------------------------------------*/
30744e2b4712SSatish Balay /*
30754e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
30764e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
30774e2b4712SSatish Balay    Not a good example of code reuse.
30784e2b4712SSatish Balay */
3079*719d5645SBarry Smith EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3080db4efbfdSBarry Smith EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3081435faa5fSBarry Smith 
30824a2ae208SSatish Balay #undef __FUNCT__
30834a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3084*719d5645SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,MatFactorInfo *info)
30854e2b4712SSatish Balay {
30864e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
30874e2b4712SSatish Balay   IS             isicol;
30886849ba73SBarry Smith   PetscErrorCode ierr;
3089690b6cddSBarry Smith   PetscInt       *r,*ic,prow,n = a->mbs,*ai = a->i,*aj = a->j;
3090690b6cddSBarry Smith   PetscInt       *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3091a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3092d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
30932af78befSBarry Smith   PetscTruth     col_identity,row_identity,flg;
3094329f5518SBarry Smith   PetscReal      f;
30954e2b4712SSatish Balay 
30964e2b4712SSatish Balay   PetscFunctionBegin;
3097435faa5fSBarry Smith   f             = info->fill;
3098690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3099690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
31004c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3101667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3102667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3103309c388cSBarry Smith 
3104309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3105*719d5645SBarry Smith     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3106*719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3107*719d5645SBarry Smith     b               = (Mat_SeqBAIJ*)(fact)->data;
3108*719d5645SBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr);
31092af78befSBarry Smith     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3110bb3d539aSBarry Smith     b->row        = isrow;
3111bb3d539aSBarry Smith     b->col        = iscol;
3112bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3113bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3114bb3d539aSBarry Smith     b->icol       = isicol;
3115bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3116*719d5645SBarry Smith     ierr          = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3117309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
31184e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
31194e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
31204e2b4712SSatish Balay 
31214e2b4712SSatish Balay     /* get new row pointers */
3122690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
31234e2b4712SSatish Balay     ainew[0] = 0;
31244e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3125690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3126690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
31274e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3128690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
31294e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3130690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
31314e2b4712SSatish Balay     /* im is level for each filled value */
3132690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
31334e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3134690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
31354e2b4712SSatish Balay     dloc[0]  = 0;
31364e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3137435faa5fSBarry Smith 
3138435faa5fSBarry Smith       /* copy prow into linked list */
31394e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
314029bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
31414e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
31424e2b4712SSatish Balay       fill[n]    = n;
3143435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
31444e2b4712SSatish Balay       while (nz--) {
31454e2b4712SSatish Balay 	fm  = n;
31464e2b4712SSatish Balay 	idx = ic[*xi++];
31474e2b4712SSatish Balay 	do {
31484e2b4712SSatish Balay 	  m  = fm;
31494e2b4712SSatish Balay 	  fm = fill[m];
31504e2b4712SSatish Balay 	} while (fm < idx);
31514e2b4712SSatish Balay 	fill[m]   = idx;
31524e2b4712SSatish Balay 	fill[idx] = fm;
31534e2b4712SSatish Balay 	im[idx]   = 0;
31544e2b4712SSatish Balay       }
3155435faa5fSBarry Smith 
3156435faa5fSBarry Smith       /* make sure diagonal entry is included */
3157435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3158435faa5fSBarry Smith 	fm = n;
3159435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3160435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3161435faa5fSBarry Smith 	fill[fm]   = prow;
3162435faa5fSBarry Smith 	im[prow]   = 0;
3163435faa5fSBarry Smith 	nzf++;
3164335d9088SBarry Smith 	dcount++;
3165435faa5fSBarry Smith       }
3166435faa5fSBarry Smith 
31674e2b4712SSatish Balay       nzi = 0;
31684e2b4712SSatish Balay       row = fill[n];
31694e2b4712SSatish Balay       while (row < prow) {
31704e2b4712SSatish Balay 	incrlev = im[row] + 1;
31714e2b4712SSatish Balay 	nz      = dloc[row];
3172435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
31734e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
31744e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
31754e2b4712SSatish Balay 	fm      = row;
31764e2b4712SSatish Balay 	while (nnz-- > 0) {
31774e2b4712SSatish Balay 	  idx = *xi++;
31784e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
31794e2b4712SSatish Balay 	    flev++;
31804e2b4712SSatish Balay 	    continue;
31814e2b4712SSatish Balay 	  }
31824e2b4712SSatish Balay 	  do {
31834e2b4712SSatish Balay 	    m  = fm;
31844e2b4712SSatish Balay 	    fm = fill[m];
31854e2b4712SSatish Balay 	  } while (fm < idx);
31864e2b4712SSatish Balay 	  if (fm != idx) {
31874e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
31884e2b4712SSatish Balay 	    fill[m]   = idx;
31894e2b4712SSatish Balay 	    fill[idx] = fm;
31904e2b4712SSatish Balay 	    fm        = idx;
31914e2b4712SSatish Balay 	    nzf++;
3192ecf371e4SBarry Smith 	  } else {
31934e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
31944e2b4712SSatish Balay 	  }
31954e2b4712SSatish Balay 	  flev++;
31964e2b4712SSatish Balay 	}
31974e2b4712SSatish Balay 	row = fill[row];
31984e2b4712SSatish Balay 	nzi++;
31994e2b4712SSatish Balay       }
32004e2b4712SSatish Balay       /* copy new filled row into permanent storage */
32014e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
32024e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3203ecf371e4SBarry Smith 
3204ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3205ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3206ecf371e4SBarry Smith 	/* just double the memory each time */
3207690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3208ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
32094e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
32104e2b4712SSatish Balay 	jmax += maxadd;
3211ecf371e4SBarry Smith 
3212ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
3213690b6cddSBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3214690b6cddSBarry Smith 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3215606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
32164e2b4712SSatish Balay 	ajnew = xi;
3217690b6cddSBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xi);CHKERRQ(ierr);
3218690b6cddSBarry Smith 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3219606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
32204e2b4712SSatish Balay 	ajfill = xi;
3221eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
32224e2b4712SSatish Balay       }
32234e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
32244e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
32254e2b4712SSatish Balay       dloc[prow]  = nzi;
32264e2b4712SSatish Balay       fm          = fill[n];
32274e2b4712SSatish Balay       while (nzf--) {
32284e2b4712SSatish Balay 	*xi++   = fm;
32294e2b4712SSatish Balay 	*flev++ = im[fm];
32304e2b4712SSatish Balay 	fm      = fill[fm];
32314e2b4712SSatish Balay       }
3232435faa5fSBarry Smith       /* make sure row has diagonal entry */
3233435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
323477431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
32352401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3236435faa5fSBarry Smith       }
32374e2b4712SSatish Balay     }
3238606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
32394e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
32404e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3241606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3242606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
32434e2b4712SSatish Balay 
32446cf91177SBarry Smith #if defined(PETSC_USE_INFO)
32454e2b4712SSatish Balay     {
3246329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3247ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3248ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3249ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3250ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3251335d9088SBarry Smith       if (diagonal_fill) {
3252ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3253335d9088SBarry Smith       }
32544e2b4712SSatish Balay     }
325563ba0a88SBarry Smith #endif
32564e2b4712SSatish Balay 
32574e2b4712SSatish Balay     /* put together the new matrix */
3258*719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3259*719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3260*719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3261e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3262e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
32637c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3264a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
32654e2b4712SSatish Balay     b->j          = ajnew;
32664e2b4712SSatish Balay     b->i          = ainew;
32674e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
32684e2b4712SSatish Balay     b->diag       = dloc;
32694e2b4712SSatish Balay     b->ilen       = 0;
32704e2b4712SSatish Balay     b->imax       = 0;
32714e2b4712SSatish Balay     b->row        = isrow;
32724e2b4712SSatish Balay     b->col        = iscol;
3273bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3274c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3275c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3276e51c0b9cSSatish Balay     b->icol       = isicol;
327787828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
32784e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
32794e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3280*719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
32814e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
32824e2b4712SSatish Balay 
3283*719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3284*719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3285*719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3286309c388cSBarry Smith   }
3287*719d5645SBarry Smith   ierr = MatSeqBAIJSetNumericFactorization(fact,row_identity && col_identity);CHKERRQ(ierr);
32888661488fSKris Buschelman   PetscFunctionReturn(0);
32898661488fSKris Buschelman }
32908661488fSKris Buschelman 
3291732ee342SKris Buschelman #undef __FUNCT__
32927e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3293dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
32947e7071cdSKris Buschelman {
329512272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
329612272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
32975a9542e3SKris Buschelman   PetscFunctionBegin;
32987cf1b8d3SKris Buschelman   /* Undo Column scaling */
32997cf1b8d3SKris Buschelman /*    while (nz--) { */
33007cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
33017cf1b8d3SKris Buschelman /*    } */
3302c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3303c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
33047cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
33057cf1b8d3SKris Buschelman }
33067cf1b8d3SKris Buschelman 
33077cf1b8d3SKris Buschelman #undef __FUNCT__
33087cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3309dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
33107cf1b8d3SKris Buschelman {
33117cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3312b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
33132aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
33145a9542e3SKris Buschelman   PetscFunctionBegin;
33150b9da03eSKris Buschelman   /* Is this really necessary? */
331620235379SKris Buschelman   while (nz--) {
33170b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
33187e7071cdSKris Buschelman   }
3319c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
33207e7071cdSKris Buschelman   PetscFunctionReturn(0);
33217e7071cdSKris Buschelman }
33227e7071cdSKris Buschelman 
3323732ee342SKris Buschelman 
3324