xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision b988c221ff159da9d5daeaf0f2e9b912f722e89f)
1*b988c221SKris Buschelman /*$Id: baijfact2.c,v 1.61 2001/07/12 23:39:38 buschelm Exp buschelm $*/
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
74e2b4712SSatish Balay #include "src/vec/vecimpl.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17f1af5d2fSBarry Smith   int             *diag = a->diag;
18f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
19f1af5d2fSBarry Smith   Scalar          s1,*x,*b;
20f1af5d2fSBarry Smith 
21f1af5d2fSBarry Smith   PetscFunctionBegin;
22f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24f1af5d2fSBarry Smith 
25f1af5d2fSBarry Smith   /* forward solve the U^T */
26f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
27f1af5d2fSBarry Smith 
28f1af5d2fSBarry Smith     v     = aa + diag[i];
29f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
30f1af5d2fSBarry Smith     s1    = (*v++)*b[i];
31f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
32f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
33f1af5d2fSBarry Smith     while (nz--) {
34f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
35f1af5d2fSBarry Smith     }
36f1af5d2fSBarry Smith     x[i]   = s1;
37f1af5d2fSBarry Smith   }
38f1af5d2fSBarry Smith   /* backward solve the L^T */
39f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
40f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
41f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
42f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
43f1af5d2fSBarry Smith     s1   = x[i];
44f1af5d2fSBarry Smith     while (nz--) {
45f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
46f1af5d2fSBarry Smith     }
47f1af5d2fSBarry Smith   }
48f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
50b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
51f1af5d2fSBarry Smith   PetscFunctionReturn(0);
52f1af5d2fSBarry Smith }
53f1af5d2fSBarry Smith 
544a2ae208SSatish Balay #undef __FUNCT__
554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57f1af5d2fSBarry Smith {
58f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
61f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
62f1af5d2fSBarry Smith   Scalar          s1,s2,x1,x2;
63f1af5d2fSBarry Smith   Scalar          *x,*b;
64f1af5d2fSBarry Smith 
65f1af5d2fSBarry Smith   PetscFunctionBegin;
66f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
67f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
68f1af5d2fSBarry Smith 
69f1af5d2fSBarry Smith   /* forward solve the U^T */
70f1af5d2fSBarry Smith   idx = 0;
71f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
72f1af5d2fSBarry Smith 
73f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
74f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
75f1af5d2fSBarry Smith     x1 = b[idx];   x2 = b[1+idx];
76f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
77f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
78f1af5d2fSBarry Smith     v += 4;
79f1af5d2fSBarry Smith 
80f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
81f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
82f1af5d2fSBarry Smith     while (nz--) {
83f1af5d2fSBarry Smith       oidx = 2*(*vi++);
84f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
85f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
86f1af5d2fSBarry Smith       v  += 4;
87f1af5d2fSBarry Smith     }
88f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
89f1af5d2fSBarry Smith     idx += 2;
90f1af5d2fSBarry Smith   }
91f1af5d2fSBarry Smith   /* backward solve the L^T */
92f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
93f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
94f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
95f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
96f1af5d2fSBarry Smith     idt  = 2*i;
97f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
98f1af5d2fSBarry Smith     while (nz--) {
99f1af5d2fSBarry Smith       idx   = 2*(*vi--);
100f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
101f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
102f1af5d2fSBarry Smith       v -= 4;
103f1af5d2fSBarry Smith     }
104f1af5d2fSBarry Smith   }
105f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
106f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
107b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
108f1af5d2fSBarry Smith   PetscFunctionReturn(0);
109f1af5d2fSBarry Smith }
110f1af5d2fSBarry Smith 
1114a2ae208SSatish Balay #undef __FUNCT__
1124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
1137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
114f1af5d2fSBarry Smith {
115f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
116f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
117f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
118f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
119f1af5d2fSBarry Smith   Scalar          s1,s2,s3,x1,x2,x3;
120f1af5d2fSBarry Smith   Scalar          *x,*b;
121f1af5d2fSBarry Smith 
122f1af5d2fSBarry Smith   PetscFunctionBegin;
123f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
124f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125f1af5d2fSBarry Smith 
126f1af5d2fSBarry Smith   /* forward solve the U^T */
127f1af5d2fSBarry Smith   idx = 0;
128f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
129f1af5d2fSBarry Smith 
130f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
131f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
132f1af5d2fSBarry Smith     x1 = b[idx];   x2 = b[1+idx]; x3    = b[2+idx];
133f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
134f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
135f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
136f1af5d2fSBarry Smith     v += 9;
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
139f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
140f1af5d2fSBarry Smith     while (nz--) {
141f1af5d2fSBarry Smith       oidx = 3*(*vi++);
142f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
143f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
144f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
145f1af5d2fSBarry Smith       v  += 9;
146f1af5d2fSBarry Smith     }
147f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
148f1af5d2fSBarry Smith     idx += 3;
149f1af5d2fSBarry Smith   }
150f1af5d2fSBarry Smith   /* backward solve the L^T */
151f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
152f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
153f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
154f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
155f1af5d2fSBarry Smith     idt  = 3*i;
156f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
157f1af5d2fSBarry Smith     while (nz--) {
158f1af5d2fSBarry Smith       idx   = 3*(*vi--);
159f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
160f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
161f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
162f1af5d2fSBarry Smith       v -= 9;
163f1af5d2fSBarry Smith     }
164f1af5d2fSBarry Smith   }
165f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
167b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
168f1af5d2fSBarry Smith   PetscFunctionReturn(0);
169f1af5d2fSBarry Smith }
170f1af5d2fSBarry Smith 
1714a2ae208SSatish Balay #undef __FUNCT__
1724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
1737c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
174f1af5d2fSBarry Smith {
175f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
176f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
177f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
178f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
179f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
180f1af5d2fSBarry Smith   Scalar          *x,*b;
181f1af5d2fSBarry Smith 
182f1af5d2fSBarry Smith   PetscFunctionBegin;
183f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
184f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185f1af5d2fSBarry Smith 
186f1af5d2fSBarry Smith   /* forward solve the U^T */
187f1af5d2fSBarry Smith   idx = 0;
188f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
189f1af5d2fSBarry Smith 
190f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
191f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
192f1af5d2fSBarry Smith     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx];
193f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
194f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
195f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
196f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
197f1af5d2fSBarry Smith     v += 16;
198f1af5d2fSBarry Smith 
199f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
200f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
201f1af5d2fSBarry Smith     while (nz--) {
202f1af5d2fSBarry Smith       oidx = 4*(*vi++);
203f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
204f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
205f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
206f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
207f1af5d2fSBarry Smith       v  += 16;
208f1af5d2fSBarry Smith     }
209f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
210f1af5d2fSBarry Smith     idx += 4;
211f1af5d2fSBarry Smith   }
212f1af5d2fSBarry Smith   /* backward solve the L^T */
213f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
214f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
215f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
216f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
217f1af5d2fSBarry Smith     idt  = 4*i;
218f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
219f1af5d2fSBarry Smith     while (nz--) {
220f1af5d2fSBarry Smith       idx   = 4*(*vi--);
221f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
222f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
223f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
224f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
225f1af5d2fSBarry Smith       v -= 16;
226f1af5d2fSBarry Smith     }
227f1af5d2fSBarry Smith   }
228f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
229f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
230b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
231f1af5d2fSBarry Smith   PetscFunctionReturn(0);
232f1af5d2fSBarry Smith }
233f1af5d2fSBarry Smith 
2344a2ae208SSatish Balay #undef __FUNCT__
2354a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
2367c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
237f1af5d2fSBarry Smith {
238f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
239f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
240f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
241f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
242f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
243f1af5d2fSBarry Smith   Scalar          *x,*b;
244f1af5d2fSBarry Smith 
245f1af5d2fSBarry Smith   PetscFunctionBegin;
246f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
247f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248f1af5d2fSBarry Smith 
249f1af5d2fSBarry Smith   /* forward solve the U^T */
250f1af5d2fSBarry Smith   idx = 0;
251f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
252f1af5d2fSBarry Smith 
253f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
254f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
255f1af5d2fSBarry Smith     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
256f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
257f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
258f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
259f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
260f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
261f1af5d2fSBarry Smith     v += 25;
262f1af5d2fSBarry Smith 
263f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
264f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
265f1af5d2fSBarry Smith     while (nz--) {
266f1af5d2fSBarry Smith       oidx = 5*(*vi++);
267f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
268f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
269f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
270f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
271f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
272f1af5d2fSBarry Smith       v  += 25;
273f1af5d2fSBarry Smith     }
274f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
275f1af5d2fSBarry Smith     idx += 5;
276f1af5d2fSBarry Smith   }
277f1af5d2fSBarry Smith   /* backward solve the L^T */
278f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
279f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
280f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
281f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
282f1af5d2fSBarry Smith     idt  = 5*i;
283f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
284f1af5d2fSBarry Smith     while (nz--) {
285f1af5d2fSBarry Smith       idx   = 5*(*vi--);
286f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
287f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
288f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
289f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
290f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
291f1af5d2fSBarry Smith       v -= 25;
292f1af5d2fSBarry Smith     }
293f1af5d2fSBarry Smith   }
294f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
297f1af5d2fSBarry Smith   PetscFunctionReturn(0);
298f1af5d2fSBarry Smith }
299f1af5d2fSBarry Smith 
3004a2ae208SSatish Balay #undef __FUNCT__
3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
3027c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
303f1af5d2fSBarry Smith {
304f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
305f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
306f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
307f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
308f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
309f1af5d2fSBarry Smith   Scalar          *x,*b;
310f1af5d2fSBarry Smith 
311f1af5d2fSBarry Smith   PetscFunctionBegin;
312f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
313f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
314f1af5d2fSBarry Smith 
315f1af5d2fSBarry Smith   /* forward solve the U^T */
316f1af5d2fSBarry Smith   idx = 0;
317f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
318f1af5d2fSBarry Smith 
319f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
320f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
321f1af5d2fSBarry Smith     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
322f1af5d2fSBarry Smith     x6    = b[5+idx];
323f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
324f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
325f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
326f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
327f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
328f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
329f1af5d2fSBarry Smith     v += 36;
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
332f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
333f1af5d2fSBarry Smith     while (nz--) {
334f1af5d2fSBarry Smith       oidx = 6*(*vi++);
335f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
336f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
337f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
338f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
339f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
340f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
341f1af5d2fSBarry Smith       v  += 36;
342f1af5d2fSBarry Smith     }
343f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
344f1af5d2fSBarry Smith     x[5+idx] = s6;
345f1af5d2fSBarry Smith     idx += 6;
346f1af5d2fSBarry Smith   }
347f1af5d2fSBarry Smith   /* backward solve the L^T */
348f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
349f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
350f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
351f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
352f1af5d2fSBarry Smith     idt  = 6*i;
353f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
354f1af5d2fSBarry Smith     s6 = x[5+idt];
355f1af5d2fSBarry Smith     while (nz--) {
356f1af5d2fSBarry Smith       idx   = 6*(*vi--);
357f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
358f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
359f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
360f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
361f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
362f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
363f1af5d2fSBarry Smith       v -= 36;
364f1af5d2fSBarry Smith     }
365f1af5d2fSBarry Smith   }
366f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
367f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
368b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
369f1af5d2fSBarry Smith   PetscFunctionReturn(0);
370f1af5d2fSBarry Smith }
371f1af5d2fSBarry Smith 
3724a2ae208SSatish Balay #undef __FUNCT__
3734a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
3747c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
375f1af5d2fSBarry Smith {
376f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
377f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
379f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
380f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
381f1af5d2fSBarry Smith   Scalar          *x,*b;
382f1af5d2fSBarry Smith 
383f1af5d2fSBarry Smith   PetscFunctionBegin;
384f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
385f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
386f1af5d2fSBarry Smith 
387f1af5d2fSBarry Smith   /* forward solve the U^T */
388f1af5d2fSBarry Smith   idx = 0;
389f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
390f1af5d2fSBarry Smith 
391f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
392f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
393f1af5d2fSBarry Smith     x1    = b[idx];   x2 = b[1+idx]; x3    = b[2+idx]; x4 = b[3+idx]; x5 = b[4+idx];
394f1af5d2fSBarry Smith     x6    = b[5+idx]; x7 = b[6+idx];
395f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
396f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
397f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
398f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
399f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
400f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
401f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
402f1af5d2fSBarry Smith     v += 49;
403f1af5d2fSBarry Smith 
404f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
405f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
406f1af5d2fSBarry Smith     while (nz--) {
407f1af5d2fSBarry Smith       oidx = 7*(*vi++);
408f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
409f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
410f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
411f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
412f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
413f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
414f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
415f1af5d2fSBarry Smith       v  += 49;
416f1af5d2fSBarry Smith     }
417f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
418f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
419f1af5d2fSBarry Smith     idx += 7;
420f1af5d2fSBarry Smith   }
421f1af5d2fSBarry Smith   /* backward solve the L^T */
422f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
423f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
424f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
425f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
426f1af5d2fSBarry Smith     idt  = 7*i;
427f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
428f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
429f1af5d2fSBarry Smith     while (nz--) {
430f1af5d2fSBarry Smith       idx   = 7*(*vi--);
431f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
432f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
433f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
434f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
435f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
436f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
437f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
438f1af5d2fSBarry Smith       v -= 49;
439f1af5d2fSBarry Smith     }
440f1af5d2fSBarry Smith   }
441f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
442f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
443b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
444f1af5d2fSBarry Smith   PetscFunctionReturn(0);
445f1af5d2fSBarry Smith }
446f1af5d2fSBarry Smith 
447f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4484a2ae208SSatish Balay #undef __FUNCT__
4494a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
4507c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
451f1af5d2fSBarry Smith {
452f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
453f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
454f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
455f1af5d2fSBarry Smith   int             *diag = a->diag;
456f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
457f1af5d2fSBarry Smith   Scalar          s1,*x,*b,*t;
458f1af5d2fSBarry Smith 
459f1af5d2fSBarry Smith   PetscFunctionBegin;
460f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
461f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
462f1af5d2fSBarry Smith   t  = a->solve_work;
463f1af5d2fSBarry Smith 
464f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
465f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
466f1af5d2fSBarry Smith 
467f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
468f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
469f1af5d2fSBarry Smith     t[i] = b[c[i]];
470f1af5d2fSBarry Smith   }
471f1af5d2fSBarry Smith 
472f1af5d2fSBarry Smith   /* forward solve the U^T */
473f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith     v     = aa + diag[i];
476f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
477f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
478f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
479f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
480f1af5d2fSBarry Smith     while (nz--) {
481f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
482f1af5d2fSBarry Smith     }
483f1af5d2fSBarry Smith     t[i]   = s1;
484f1af5d2fSBarry Smith   }
485f1af5d2fSBarry Smith   /* backward solve the L^T */
486f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
487f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
488f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
489f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
490f1af5d2fSBarry Smith     s1   = t[i];
491f1af5d2fSBarry Smith     while (nz--) {
492f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
493f1af5d2fSBarry Smith     }
494f1af5d2fSBarry Smith   }
495f1af5d2fSBarry Smith 
496f1af5d2fSBarry Smith   /* copy t into x according to permutation */
497f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
498f1af5d2fSBarry Smith     x[r[i]]   = t[i];
499f1af5d2fSBarry Smith   }
500f1af5d2fSBarry Smith 
501f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
502f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
503f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
504f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
505b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
506f1af5d2fSBarry Smith   PetscFunctionReturn(0);
507f1af5d2fSBarry Smith }
508f1af5d2fSBarry Smith 
5094a2ae208SSatish Balay #undef __FUNCT__
5104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
5117c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
512f1af5d2fSBarry Smith {
513f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
514f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
515f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
516f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
517f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
518f1af5d2fSBarry Smith   Scalar          s1,s2,x1,x2;
519f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
520f1af5d2fSBarry Smith 
521f1af5d2fSBarry Smith   PetscFunctionBegin;
522f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
523f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   t  = a->solve_work;
525f1af5d2fSBarry Smith 
526f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
527f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
528f1af5d2fSBarry Smith 
529f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
530f1af5d2fSBarry Smith   ii = 0;
531f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
532f1af5d2fSBarry Smith     ic      = 2*c[i];
533f1af5d2fSBarry Smith     t[ii]   = b[ic];
534f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
535f1af5d2fSBarry Smith     ii += 2;
536f1af5d2fSBarry Smith   }
537f1af5d2fSBarry Smith 
538f1af5d2fSBarry Smith   /* forward solve the U^T */
539f1af5d2fSBarry Smith   idx = 0;
540f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
541f1af5d2fSBarry Smith 
542f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
543f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
544f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
545f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
546f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
547f1af5d2fSBarry Smith     v += 4;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
550f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
551f1af5d2fSBarry Smith     while (nz--) {
552f1af5d2fSBarry Smith       oidx = 2*(*vi++);
553f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
554f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
555f1af5d2fSBarry Smith       v  += 4;
556f1af5d2fSBarry Smith     }
557f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
558f1af5d2fSBarry Smith     idx += 2;
559f1af5d2fSBarry Smith   }
560f1af5d2fSBarry Smith   /* backward solve the L^T */
561f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
562f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
563f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
564f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
565f1af5d2fSBarry Smith     idt  = 2*i;
566f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
567f1af5d2fSBarry Smith     while (nz--) {
568f1af5d2fSBarry Smith       idx   = 2*(*vi--);
569f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
570f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
571f1af5d2fSBarry Smith       v -= 4;
572f1af5d2fSBarry Smith     }
573f1af5d2fSBarry Smith   }
574f1af5d2fSBarry Smith 
575f1af5d2fSBarry Smith   /* copy t into x according to permutation */
576f1af5d2fSBarry Smith   ii = 0;
577f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
578f1af5d2fSBarry Smith     ir      = 2*r[i];
579f1af5d2fSBarry Smith     x[ir]   = t[ii];
580f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
581f1af5d2fSBarry Smith     ii += 2;
582f1af5d2fSBarry Smith   }
583f1af5d2fSBarry Smith 
584f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
585f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
586f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
587f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
588b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
589f1af5d2fSBarry Smith   PetscFunctionReturn(0);
590f1af5d2fSBarry Smith }
591f1af5d2fSBarry Smith 
5924a2ae208SSatish Balay #undef __FUNCT__
5934a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
5947c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
595f1af5d2fSBarry Smith {
596f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
597f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
598f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
599f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
600f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
601f1af5d2fSBarry Smith   Scalar          s1,s2,s3,x1,x2,x3;
602f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   PetscFunctionBegin;
605f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
606f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
607f1af5d2fSBarry Smith   t  = a->solve_work;
608f1af5d2fSBarry Smith 
609f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
610f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
611f1af5d2fSBarry Smith 
612f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
613f1af5d2fSBarry Smith   ii = 0;
614f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
615f1af5d2fSBarry Smith     ic      = 3*c[i];
616f1af5d2fSBarry Smith     t[ii]   = b[ic];
617f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
618f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
619f1af5d2fSBarry Smith     ii += 3;
620f1af5d2fSBarry Smith   }
621f1af5d2fSBarry Smith 
622f1af5d2fSBarry Smith   /* forward solve the U^T */
623f1af5d2fSBarry Smith   idx = 0;
624f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
627f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
628f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
629f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
630f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
631f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
632f1af5d2fSBarry Smith     v += 9;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
635f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
636f1af5d2fSBarry Smith     while (nz--) {
637f1af5d2fSBarry Smith       oidx = 3*(*vi++);
638f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
639f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
640f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
641f1af5d2fSBarry Smith       v  += 9;
642f1af5d2fSBarry Smith     }
643f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
644f1af5d2fSBarry Smith     idx += 3;
645f1af5d2fSBarry Smith   }
646f1af5d2fSBarry Smith   /* backward solve the L^T */
647f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
648f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
649f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
650f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
651f1af5d2fSBarry Smith     idt  = 3*i;
652f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
653f1af5d2fSBarry Smith     while (nz--) {
654f1af5d2fSBarry Smith       idx   = 3*(*vi--);
655f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
656f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
657f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
658f1af5d2fSBarry Smith       v -= 9;
659f1af5d2fSBarry Smith     }
660f1af5d2fSBarry Smith   }
661f1af5d2fSBarry Smith 
662f1af5d2fSBarry Smith   /* copy t into x according to permutation */
663f1af5d2fSBarry Smith   ii = 0;
664f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
665f1af5d2fSBarry Smith     ir      = 3*r[i];
666f1af5d2fSBarry Smith     x[ir]   = t[ii];
667f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
668f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
669f1af5d2fSBarry Smith     ii += 3;
670f1af5d2fSBarry Smith   }
671f1af5d2fSBarry Smith 
672f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
673f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
674f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
675f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
676b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
677f1af5d2fSBarry Smith   PetscFunctionReturn(0);
678f1af5d2fSBarry Smith }
679f1af5d2fSBarry Smith 
6804a2ae208SSatish Balay #undef __FUNCT__
6814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
6827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
683f1af5d2fSBarry Smith {
684f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
685f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
686f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
687f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
688f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
689f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,x1,x2,x3,x4;
690f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
691f1af5d2fSBarry Smith 
692f1af5d2fSBarry Smith   PetscFunctionBegin;
693f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
694f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   t  = a->solve_work;
696f1af5d2fSBarry Smith 
697f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
698f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
699f1af5d2fSBarry Smith 
700f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
701f1af5d2fSBarry Smith   ii = 0;
702f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
703f1af5d2fSBarry Smith     ic      = 4*c[i];
704f1af5d2fSBarry Smith     t[ii]   = b[ic];
705f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
706f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
707f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
708f1af5d2fSBarry Smith     ii += 4;
709f1af5d2fSBarry Smith   }
710f1af5d2fSBarry Smith 
711f1af5d2fSBarry Smith   /* forward solve the U^T */
712f1af5d2fSBarry Smith   idx = 0;
713f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
714f1af5d2fSBarry Smith 
715f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
716f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
717f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
718f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
719f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
720f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
721f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
722f1af5d2fSBarry Smith     v += 16;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
725f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
726f1af5d2fSBarry Smith     while (nz--) {
727f1af5d2fSBarry Smith       oidx = 4*(*vi++);
728f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
729f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
730f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
731f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
732f1af5d2fSBarry Smith       v  += 16;
733f1af5d2fSBarry Smith     }
734f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
735f1af5d2fSBarry Smith     idx += 4;
736f1af5d2fSBarry Smith   }
737f1af5d2fSBarry Smith   /* backward solve the L^T */
738f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
739f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
740f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
741f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
742f1af5d2fSBarry Smith     idt  = 4*i;
743f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
744f1af5d2fSBarry Smith     while (nz--) {
745f1af5d2fSBarry Smith       idx   = 4*(*vi--);
746f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
747f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
748f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
749f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
750f1af5d2fSBarry Smith       v -= 16;
751f1af5d2fSBarry Smith     }
752f1af5d2fSBarry Smith   }
753f1af5d2fSBarry Smith 
754f1af5d2fSBarry Smith   /* copy t into x according to permutation */
755f1af5d2fSBarry Smith   ii = 0;
756f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
757f1af5d2fSBarry Smith     ir      = 4*r[i];
758f1af5d2fSBarry Smith     x[ir]   = t[ii];
759f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
760f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
761f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
762f1af5d2fSBarry Smith     ii += 4;
763f1af5d2fSBarry Smith   }
764f1af5d2fSBarry Smith 
765f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
766f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
767f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
768f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
769b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
770f1af5d2fSBarry Smith   PetscFunctionReturn(0);
771f1af5d2fSBarry Smith }
772f1af5d2fSBarry Smith 
7734a2ae208SSatish Balay #undef __FUNCT__
7744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
7757c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
776f1af5d2fSBarry Smith {
777f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
778f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
779f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
780f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
781f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
782f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
783f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
784f1af5d2fSBarry Smith 
785f1af5d2fSBarry Smith   PetscFunctionBegin;
786f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
787f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
788f1af5d2fSBarry Smith   t  = a->solve_work;
789f1af5d2fSBarry Smith 
790f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
791f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
792f1af5d2fSBarry Smith 
793f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
794f1af5d2fSBarry Smith   ii = 0;
795f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
796f1af5d2fSBarry Smith     ic      = 5*c[i];
797f1af5d2fSBarry Smith     t[ii]   = b[ic];
798f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
799f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
800f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
801f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
802f1af5d2fSBarry Smith     ii += 5;
803f1af5d2fSBarry Smith   }
804f1af5d2fSBarry Smith 
805f1af5d2fSBarry Smith   /* forward solve the U^T */
806f1af5d2fSBarry Smith   idx = 0;
807f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
808f1af5d2fSBarry Smith 
809f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
810f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
811f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
812f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
813f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
814f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
815f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
816f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
817f1af5d2fSBarry Smith     v += 25;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
820f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
821f1af5d2fSBarry Smith     while (nz--) {
822f1af5d2fSBarry Smith       oidx = 5*(*vi++);
823f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
824f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
825f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
826f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
827f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
828f1af5d2fSBarry Smith       v  += 25;
829f1af5d2fSBarry Smith     }
830f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
831f1af5d2fSBarry Smith     idx += 5;
832f1af5d2fSBarry Smith   }
833f1af5d2fSBarry Smith   /* backward solve the L^T */
834f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
835f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
836f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
837f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
838f1af5d2fSBarry Smith     idt  = 5*i;
839f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
840f1af5d2fSBarry Smith     while (nz--) {
841f1af5d2fSBarry Smith       idx   = 5*(*vi--);
842f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
843f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
844f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
845f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
846f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
847f1af5d2fSBarry Smith       v -= 25;
848f1af5d2fSBarry Smith     }
849f1af5d2fSBarry Smith   }
850f1af5d2fSBarry Smith 
851f1af5d2fSBarry Smith   /* copy t into x according to permutation */
852f1af5d2fSBarry Smith   ii = 0;
853f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
854f1af5d2fSBarry Smith     ir      = 5*r[i];
855f1af5d2fSBarry Smith     x[ir]   = t[ii];
856f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
857f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
858f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
859f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
860f1af5d2fSBarry Smith     ii += 5;
861f1af5d2fSBarry Smith   }
862f1af5d2fSBarry Smith 
863f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
864f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
865f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
866f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
867b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
868f1af5d2fSBarry Smith   PetscFunctionReturn(0);
869f1af5d2fSBarry Smith }
870f1af5d2fSBarry Smith 
8714a2ae208SSatish Balay #undef __FUNCT__
8724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
8737c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
874f1af5d2fSBarry Smith {
875f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
876f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
877f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
878f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
879f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
880f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
881f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
882f1af5d2fSBarry Smith 
883f1af5d2fSBarry Smith   PetscFunctionBegin;
884f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
885f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
886f1af5d2fSBarry Smith   t  = a->solve_work;
887f1af5d2fSBarry Smith 
888f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
889f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
890f1af5d2fSBarry Smith 
891f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
892f1af5d2fSBarry Smith   ii = 0;
893f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
894f1af5d2fSBarry Smith     ic      = 6*c[i];
895f1af5d2fSBarry Smith     t[ii]   = b[ic];
896f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
897f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
898f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
899f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
900f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
901f1af5d2fSBarry Smith     ii += 6;
902f1af5d2fSBarry Smith   }
903f1af5d2fSBarry Smith 
904f1af5d2fSBarry Smith   /* forward solve the U^T */
905f1af5d2fSBarry Smith   idx = 0;
906f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
907f1af5d2fSBarry Smith 
908f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
909f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
910f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
911f1af5d2fSBarry Smith     x6    = t[5+idx];
912f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
913f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
914f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
915f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
916f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
917f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
918f1af5d2fSBarry Smith     v += 36;
919f1af5d2fSBarry Smith 
920f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
921f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
922f1af5d2fSBarry Smith     while (nz--) {
923f1af5d2fSBarry Smith       oidx = 6*(*vi++);
924f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
925f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
926f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
927f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
928f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
929f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
930f1af5d2fSBarry Smith       v  += 36;
931f1af5d2fSBarry Smith     }
932f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
933f1af5d2fSBarry Smith     t[5+idx] = s6;
934f1af5d2fSBarry Smith     idx += 6;
935f1af5d2fSBarry Smith   }
936f1af5d2fSBarry Smith   /* backward solve the L^T */
937f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
938f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
939f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
940f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
941f1af5d2fSBarry Smith     idt  = 6*i;
942f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
943f1af5d2fSBarry Smith     s6 = t[5+idt];
944f1af5d2fSBarry Smith     while (nz--) {
945f1af5d2fSBarry Smith       idx   = 6*(*vi--);
946f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
947f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
948f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
949f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
950f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
951f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
952f1af5d2fSBarry Smith       v -= 36;
953f1af5d2fSBarry Smith     }
954f1af5d2fSBarry Smith   }
955f1af5d2fSBarry Smith 
956f1af5d2fSBarry Smith   /* copy t into x according to permutation */
957f1af5d2fSBarry Smith   ii = 0;
958f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
959f1af5d2fSBarry Smith     ir      = 6*r[i];
960f1af5d2fSBarry Smith     x[ir]   = t[ii];
961f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
962f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
963f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
964f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
965f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
966f1af5d2fSBarry Smith     ii += 6;
967f1af5d2fSBarry Smith   }
968f1af5d2fSBarry Smith 
969f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
970f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
971f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
972f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
973b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
974f1af5d2fSBarry Smith   PetscFunctionReturn(0);
975f1af5d2fSBarry Smith }
976f1af5d2fSBarry Smith 
9774a2ae208SSatish Balay #undef __FUNCT__
9784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
9797c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
980f1af5d2fSBarry Smith {
981f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
982f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
983f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
984f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
985f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
986f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
987f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
988f1af5d2fSBarry Smith 
989f1af5d2fSBarry Smith   PetscFunctionBegin;
990f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
991f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
992f1af5d2fSBarry Smith   t  = a->solve_work;
993f1af5d2fSBarry Smith 
994f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
995f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
998f1af5d2fSBarry Smith   ii = 0;
999f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1000f1af5d2fSBarry Smith     ic      = 7*c[i];
1001f1af5d2fSBarry Smith     t[ii]   = b[ic];
1002f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1003f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1004f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1005f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1006f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1007f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1008f1af5d2fSBarry Smith     ii += 7;
1009f1af5d2fSBarry Smith   }
1010f1af5d2fSBarry Smith 
1011f1af5d2fSBarry Smith   /* forward solve the U^T */
1012f1af5d2fSBarry Smith   idx = 0;
1013f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1014f1af5d2fSBarry Smith 
1015f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1016f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1017f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1018f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1019f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1020f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1021f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1022f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1023f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1024f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1025f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1026f1af5d2fSBarry Smith     v += 49;
1027f1af5d2fSBarry Smith 
1028f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1029f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1030f1af5d2fSBarry Smith     while (nz--) {
1031f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1032f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1033f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1034f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1035f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1036f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1037f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1038f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1039f1af5d2fSBarry Smith       v  += 49;
1040f1af5d2fSBarry Smith     }
1041f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1042f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1043f1af5d2fSBarry Smith     idx += 7;
1044f1af5d2fSBarry Smith   }
1045f1af5d2fSBarry Smith   /* backward solve the L^T */
1046f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1047f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1048f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1049f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1050f1af5d2fSBarry Smith     idt  = 7*i;
1051f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1052f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1053f1af5d2fSBarry Smith     while (nz--) {
1054f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1055f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1056f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1057f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1058f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1059f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1060f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1061f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1062f1af5d2fSBarry Smith       v -= 49;
1063f1af5d2fSBarry Smith     }
1064f1af5d2fSBarry Smith   }
1065f1af5d2fSBarry Smith 
1066f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1067f1af5d2fSBarry Smith   ii = 0;
1068f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1069f1af5d2fSBarry Smith     ir      = 7*r[i];
1070f1af5d2fSBarry Smith     x[ir]   = t[ii];
1071f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1072f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1073f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1074f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1075f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1076f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1077f1af5d2fSBarry Smith     ii += 7;
1078f1af5d2fSBarry Smith   }
1079f1af5d2fSBarry Smith 
1080f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1081f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1082f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1083f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1084b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1085f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1086f1af5d2fSBarry Smith }
1087f1af5d2fSBarry Smith 
10884e2b4712SSatish Balay /* ----------------------------------------------------------- */
10894a2ae208SSatish Balay #undef __FUNCT__
10904a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
10914e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
10924e2b4712SSatish Balay {
10934e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
10944e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
10954e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
10964e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
10973f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
1098f1af5d2fSBarry Smith   Scalar          *x,*b,*s,*t,*ls;
10994e2b4712SSatish Balay 
11004e2b4712SSatish Balay   PetscFunctionBegin;
1101e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1102e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1103f1af5d2fSBarry Smith   t  = a->solve_work;
11044e2b4712SSatish Balay 
11054e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11064e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11074e2b4712SSatish Balay 
11084e2b4712SSatish Balay   /* forward solve the lower triangular */
1109f1af5d2fSBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
11104e2b4712SSatish Balay   for (i=1; i<n; i++) {
11114e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11124e2b4712SSatish Balay     vi  = aj + ai[i];
11134e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1114f1af5d2fSBarry Smith     s = t + bs*i;
1115f1af5d2fSBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(Scalar));CHKERRQ(ierr);
11164e2b4712SSatish Balay     while (nz--) {
1117f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11184e2b4712SSatish Balay       v += bs2;
11194e2b4712SSatish Balay     }
11204e2b4712SSatish Balay   }
11214e2b4712SSatish Balay   /* backward solve the upper triangular */
1122273d9f13SBarry Smith   ls = a->solve_work + A->n;
11234e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11244e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11254e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11264e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
1127f1af5d2fSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
11284e2b4712SSatish Balay     while (nz--) {
1129f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11304e2b4712SSatish Balay       v += bs2;
11314e2b4712SSatish Balay     }
1132f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
1133f1af5d2fSBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(Scalar));CHKERRQ(ierr);
11344e2b4712SSatish Balay   }
11354e2b4712SSatish Balay 
11364e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11374e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1138e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1139e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1140b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11414e2b4712SSatish Balay   PetscFunctionReturn(0);
11424e2b4712SSatish Balay }
11434e2b4712SSatish Balay 
11444a2ae208SSatish Balay #undef __FUNCT__
11454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
11464e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11474e2b4712SSatish Balay {
11484e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11494e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11504e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11514e2b4712SSatish Balay   int             *diag = a->diag;
11523f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
1153f1af5d2fSBarry Smith   Scalar          s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1154f1af5d2fSBarry Smith   Scalar          *x,*b,*t;
11554e2b4712SSatish Balay 
11564e2b4712SSatish Balay   PetscFunctionBegin;
1157e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1158e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1159f1af5d2fSBarry Smith   t  = a->solve_work;
11604e2b4712SSatish Balay 
11614e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11624e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11634e2b4712SSatish Balay 
11644e2b4712SSatish Balay   /* forward solve the lower triangular */
11654e2b4712SSatish Balay   idx    = 7*(*r++);
1166f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1167f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1168f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11694e2b4712SSatish Balay 
11704e2b4712SSatish Balay   for (i=1; i<n; i++) {
11714e2b4712SSatish Balay     v     = aa + 49*ai[i];
11724e2b4712SSatish Balay     vi    = aj + ai[i];
11734e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11744e2b4712SSatish Balay     idx   = 7*(*r++);
1175f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1176f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11774e2b4712SSatish Balay     while (nz--) {
11784e2b4712SSatish Balay       idx   = 7*(*vi++);
1179f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1180f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1181f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1182f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1183f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1184f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1185f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1186f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1187f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1188f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11894e2b4712SSatish Balay       v += 49;
11904e2b4712SSatish Balay     }
11914e2b4712SSatish Balay     idx = 7*i;
1192f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1193f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1194f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
11954e2b4712SSatish Balay   }
11964e2b4712SSatish Balay   /* backward solve the upper triangular */
11974e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11984e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
11994e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12004e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12014e2b4712SSatish Balay     idt  = 7*i;
1202f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1203f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1204f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12054e2b4712SSatish Balay     while (nz--) {
12064e2b4712SSatish Balay       idx   = 7*(*vi++);
1207f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1208f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1209f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1210f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1211f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1212f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1213f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1214f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1215f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1216f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12174e2b4712SSatish Balay       v += 49;
12184e2b4712SSatish Balay     }
12194e2b4712SSatish Balay     idc = 7*(*c--);
12204e2b4712SSatish Balay     v   = aa + 49*diag[i];
1221f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1222f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1223f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1224f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1225f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1226f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1227f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1228f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1229f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1230f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1231f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1232f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1233f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1234f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12354e2b4712SSatish Balay   }
12364e2b4712SSatish Balay 
12374e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12384e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1239e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1240e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1241b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12424e2b4712SSatish Balay   PetscFunctionReturn(0);
12434e2b4712SSatish Balay }
12444e2b4712SSatish Balay 
12454a2ae208SSatish Balay #undef __FUNCT__
12464a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
124715091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
124815091d37SBarry Smith {
124915091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
125015091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
125115091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
125215091d37SBarry Smith   MatScalar       *aa=a->a,*v;
1253f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
125415091d37SBarry Smith 
125515091d37SBarry Smith   PetscFunctionBegin;
125615091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
125715091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
125815091d37SBarry Smith   /* forward solve the lower triangular */
125915091d37SBarry Smith   idx    = 0;
126015091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
126115091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
126215091d37SBarry Smith   x[6] = b[6+idx];
126315091d37SBarry Smith   for (i=1; i<n; i++) {
126415091d37SBarry Smith     v     =  aa + 49*ai[i];
126515091d37SBarry Smith     vi    =  aj + ai[i];
126615091d37SBarry Smith     nz    =  diag[i] - ai[i];
126715091d37SBarry Smith     idx   =  7*i;
1268f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1269f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1270f1af5d2fSBarry Smith     s7  =  b[6+idx];
127115091d37SBarry Smith     while (nz--) {
127215091d37SBarry Smith       jdx   = 7*(*vi++);
127315091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
127415091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
127515091d37SBarry Smith       x7    = x[6+jdx];
1276f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1277f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1278f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1279f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1280f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1281f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1282f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
128315091d37SBarry Smith       v += 49;
128415091d37SBarry Smith      }
1285f1af5d2fSBarry Smith     x[idx]   = s1;
1286f1af5d2fSBarry Smith     x[1+idx] = s2;
1287f1af5d2fSBarry Smith     x[2+idx] = s3;
1288f1af5d2fSBarry Smith     x[3+idx] = s4;
1289f1af5d2fSBarry Smith     x[4+idx] = s5;
1290f1af5d2fSBarry Smith     x[5+idx] = s6;
1291f1af5d2fSBarry Smith     x[6+idx] = s7;
129215091d37SBarry Smith   }
129315091d37SBarry Smith   /* backward solve the upper triangular */
129415091d37SBarry Smith   for (i=n-1; i>=0; i--){
129515091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
129615091d37SBarry Smith     vi   = aj + diag[i] + 1;
129715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
129815091d37SBarry Smith     idt  = 7*i;
1299f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1300f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1301f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1302f1af5d2fSBarry Smith     s7 = x[6+idt];
130315091d37SBarry Smith     while (nz--) {
130415091d37SBarry Smith       idx   = 7*(*vi++);
130515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
130615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
130715091d37SBarry Smith       x7    = x[6+idx];
1308f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131515091d37SBarry Smith       v += 49;
131615091d37SBarry Smith     }
131715091d37SBarry Smith     v        = aa + 49*diag[i];
1318f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1319f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1320f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1321f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1322f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1323f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1324f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1325f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1326f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1327f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1328f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1329f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1330f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1331f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
133215091d37SBarry Smith   }
133315091d37SBarry Smith 
133415091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
133515091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1336b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
133715091d37SBarry Smith   PetscFunctionReturn(0);
133815091d37SBarry Smith }
133915091d37SBarry Smith 
13404a2ae208SSatish Balay #undef __FUNCT__
13414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
134215091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
134315091d37SBarry Smith {
134415091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
134515091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
134615091d37SBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
134715091d37SBarry Smith   int             *diag = a->diag;
134815091d37SBarry Smith   MatScalar       *aa=a->a,*v;
1349f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
135015091d37SBarry Smith 
135115091d37SBarry Smith   PetscFunctionBegin;
135215091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
135315091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1354f1af5d2fSBarry Smith   t  = a->solve_work;
135515091d37SBarry Smith 
135615091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
135715091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
135815091d37SBarry Smith 
135915091d37SBarry Smith   /* forward solve the lower triangular */
136015091d37SBarry Smith   idx    = 6*(*r++);
1361f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1362f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1363f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
136415091d37SBarry Smith   for (i=1; i<n; i++) {
136515091d37SBarry Smith     v     = aa + 36*ai[i];
136615091d37SBarry Smith     vi    = aj + ai[i];
136715091d37SBarry Smith     nz    = diag[i] - ai[i];
136815091d37SBarry Smith     idx   = 6*(*r++);
1369f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1370f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
137115091d37SBarry Smith     while (nz--) {
137215091d37SBarry Smith       idx   = 6*(*vi++);
1373f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1374f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1375f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1376f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1377f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1378f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1379f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1380f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
138115091d37SBarry Smith       v += 36;
138215091d37SBarry Smith     }
138315091d37SBarry Smith     idx = 6*i;
1384f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1385f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1386f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
138715091d37SBarry Smith   }
138815091d37SBarry Smith   /* backward solve the upper triangular */
138915091d37SBarry Smith   for (i=n-1; i>=0; i--){
139015091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
139115091d37SBarry Smith     vi   = aj + diag[i] + 1;
139215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
139315091d37SBarry Smith     idt  = 6*i;
1394f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1395f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1396f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
139715091d37SBarry Smith     while (nz--) {
139815091d37SBarry Smith       idx   = 6*(*vi++);
1399f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1400f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1401f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1402f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1403f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1404f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1405f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1406f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1407f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
140815091d37SBarry Smith       v += 36;
140915091d37SBarry Smith     }
141015091d37SBarry Smith     idc = 6*(*c--);
141115091d37SBarry Smith     v   = aa + 36*diag[i];
1412f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1413f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1414f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1415f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1416f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1417f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1418f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1419f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1420f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1421f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1422f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1423f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
142415091d37SBarry Smith   }
142515091d37SBarry Smith 
142615091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
142715091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
142815091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
142915091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1430b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
143115091d37SBarry Smith   PetscFunctionReturn(0);
143215091d37SBarry Smith }
143315091d37SBarry Smith 
14344a2ae208SSatish Balay #undef __FUNCT__
14354a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
143615091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
143715091d37SBarry Smith {
143815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
143915091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
144015091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
144115091d37SBarry Smith   MatScalar       *aa=a->a,*v;
1442f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
144315091d37SBarry Smith 
144415091d37SBarry Smith   PetscFunctionBegin;
144515091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
144615091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
144715091d37SBarry Smith   /* forward solve the lower triangular */
144815091d37SBarry Smith   idx    = 0;
144915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
145015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
145115091d37SBarry Smith   for (i=1; i<n; i++) {
145215091d37SBarry Smith     v     =  aa + 36*ai[i];
145315091d37SBarry Smith     vi    =  aj + ai[i];
145415091d37SBarry Smith     nz    =  diag[i] - ai[i];
145515091d37SBarry Smith     idx   =  6*i;
1456f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1457f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
145815091d37SBarry Smith     while (nz--) {
145915091d37SBarry Smith       jdx   = 6*(*vi++);
146015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
146115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1462f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1463f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1464f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1465f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1466f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1467f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
146815091d37SBarry Smith       v += 36;
146915091d37SBarry Smith      }
1470f1af5d2fSBarry Smith     x[idx]   = s1;
1471f1af5d2fSBarry Smith     x[1+idx] = s2;
1472f1af5d2fSBarry Smith     x[2+idx] = s3;
1473f1af5d2fSBarry Smith     x[3+idx] = s4;
1474f1af5d2fSBarry Smith     x[4+idx] = s5;
1475f1af5d2fSBarry Smith     x[5+idx] = s6;
147615091d37SBarry Smith   }
147715091d37SBarry Smith   /* backward solve the upper triangular */
147815091d37SBarry Smith   for (i=n-1; i>=0; i--){
147915091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148015091d37SBarry Smith     vi   = aj + diag[i] + 1;
148115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148215091d37SBarry Smith     idt  = 6*i;
1483f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1484f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1485f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
148615091d37SBarry Smith     while (nz--) {
148715091d37SBarry Smith       idx   = 6*(*vi++);
148815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
148915091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1490f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1491f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1492f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1493f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1494f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1495f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
149615091d37SBarry Smith       v += 36;
149715091d37SBarry Smith     }
149815091d37SBarry Smith     v        = aa + 36*diag[i];
1499f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1500f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1501f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1502f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1503f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1504f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
150515091d37SBarry Smith   }
150615091d37SBarry Smith 
150715091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
150815091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1509b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
151015091d37SBarry Smith   PetscFunctionReturn(0);
151115091d37SBarry Smith }
151215091d37SBarry Smith 
15134a2ae208SSatish Balay #undef __FUNCT__
15144a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
15154e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15164e2b4712SSatish Balay {
15174e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15184e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
15194e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15204e2b4712SSatish Balay   int             *diag = a->diag;
15213f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
1522f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15234e2b4712SSatish Balay 
15244e2b4712SSatish Balay   PetscFunctionBegin;
1525e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1526e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1527f1af5d2fSBarry Smith   t  = a->solve_work;
15284e2b4712SSatish Balay 
15294e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15304e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15314e2b4712SSatish Balay 
15324e2b4712SSatish Balay   /* forward solve the lower triangular */
15334e2b4712SSatish Balay   idx    = 5*(*r++);
1534f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1535f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15364e2b4712SSatish Balay   for (i=1; i<n; i++) {
15374e2b4712SSatish Balay     v     = aa + 25*ai[i];
15384e2b4712SSatish Balay     vi    = aj + ai[i];
15394e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15404e2b4712SSatish Balay     idx   = 5*(*r++);
1541f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1542f1af5d2fSBarry Smith     s5  = b[4+idx];
15434e2b4712SSatish Balay     while (nz--) {
15444e2b4712SSatish Balay       idx   = 5*(*vi++);
1545f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1546f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1547f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1548f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1549f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1550f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1551f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15524e2b4712SSatish Balay       v += 25;
15534e2b4712SSatish Balay     }
15544e2b4712SSatish Balay     idx = 5*i;
1555f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1556f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15574e2b4712SSatish Balay   }
15584e2b4712SSatish Balay   /* backward solve the upper triangular */
15594e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15604e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15614e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15624e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15634e2b4712SSatish Balay     idt  = 5*i;
1564f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1565f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15664e2b4712SSatish Balay     while (nz--) {
15674e2b4712SSatish Balay       idx   = 5*(*vi++);
1568f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1569f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1570f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1571f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1572f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1573f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1574f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15754e2b4712SSatish Balay       v += 25;
15764e2b4712SSatish Balay     }
15774e2b4712SSatish Balay     idc = 5*(*c--);
15784e2b4712SSatish Balay     v   = aa + 25*diag[i];
1579f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1580f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1581f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1582f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1583f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1584f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1585f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1586f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1587f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1588f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
15894e2b4712SSatish Balay   }
15904e2b4712SSatish Balay 
15914e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15924e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1593e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1594e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1595b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
15964e2b4712SSatish Balay   PetscFunctionReturn(0);
15974e2b4712SSatish Balay }
15984e2b4712SSatish Balay 
15994a2ae208SSatish Balay #undef __FUNCT__
16004a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
160115091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
160215091d37SBarry Smith {
160315091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
160415091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
160515091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
160615091d37SBarry Smith   MatScalar       *aa=a->a,*v;
1607329f5518SBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
160815091d37SBarry Smith 
160915091d37SBarry Smith   PetscFunctionBegin;
161015091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
161115091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
161215091d37SBarry Smith   /* forward solve the lower triangular */
161315091d37SBarry Smith   idx    = 0;
161415091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
161515091d37SBarry Smith   for (i=1; i<n; i++) {
161615091d37SBarry Smith     v     =  aa + 25*ai[i];
161715091d37SBarry Smith     vi    =  aj + ai[i];
161815091d37SBarry Smith     nz    =  diag[i] - ai[i];
161915091d37SBarry Smith     idx   =  5*i;
1620f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
162115091d37SBarry Smith     while (nz--) {
162215091d37SBarry Smith       jdx   = 5*(*vi++);
162315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1624f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1625f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1626f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1627f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1628f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
162915091d37SBarry Smith       v    += 25;
163015091d37SBarry Smith     }
1631f1af5d2fSBarry Smith     x[idx]   = s1;
1632f1af5d2fSBarry Smith     x[1+idx] = s2;
1633f1af5d2fSBarry Smith     x[2+idx] = s3;
1634f1af5d2fSBarry Smith     x[3+idx] = s4;
1635f1af5d2fSBarry Smith     x[4+idx] = s5;
163615091d37SBarry Smith   }
163715091d37SBarry Smith   /* backward solve the upper triangular */
163815091d37SBarry Smith   for (i=n-1; i>=0; i--){
163915091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
164015091d37SBarry Smith     vi   = aj + diag[i] + 1;
164115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
164215091d37SBarry Smith     idt  = 5*i;
1643f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1644f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
164515091d37SBarry Smith     while (nz--) {
164615091d37SBarry Smith       idx   = 5*(*vi++);
164715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1648f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1649f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1650f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1651f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1652f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
165315091d37SBarry Smith       v    += 25;
165415091d37SBarry Smith     }
165515091d37SBarry Smith     v        = aa + 25*diag[i];
1656f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1657f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1658f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1659f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1660f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
166115091d37SBarry Smith   }
166215091d37SBarry Smith 
166315091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
166415091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1665b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
166615091d37SBarry Smith   PetscFunctionReturn(0);
166715091d37SBarry Smith }
166815091d37SBarry Smith 
16694a2ae208SSatish Balay #undef __FUNCT__
16704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
16714e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16724e2b4712SSatish Balay {
16734e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
16744e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
16754e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
16764e2b4712SSatish Balay   int             *diag = a->diag;
16773f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
1678f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
16794e2b4712SSatish Balay 
16804e2b4712SSatish Balay   PetscFunctionBegin;
1681e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1682e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1683f1af5d2fSBarry Smith   t  = a->solve_work;
16844e2b4712SSatish Balay 
16854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16874e2b4712SSatish Balay 
16884e2b4712SSatish Balay   /* forward solve the lower triangular */
16894e2b4712SSatish Balay   idx    = 4*(*r++);
1690f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1691f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
16924e2b4712SSatish Balay   for (i=1; i<n; i++) {
16934e2b4712SSatish Balay     v     = aa + 16*ai[i];
16944e2b4712SSatish Balay     vi    = aj + ai[i];
16954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
16964e2b4712SSatish Balay     idx   = 4*(*r++);
1697f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
16984e2b4712SSatish Balay     while (nz--) {
16994e2b4712SSatish Balay       idx   = 4*(*vi++);
1700f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1701f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1702f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1703f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1704f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17054e2b4712SSatish Balay       v    += 16;
17064e2b4712SSatish Balay     }
17074e2b4712SSatish Balay     idx        = 4*i;
1708f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1709f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17104e2b4712SSatish Balay   }
17114e2b4712SSatish Balay   /* backward solve the upper triangular */
17124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17134e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17164e2b4712SSatish Balay     idt  = 4*i;
1717f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1718f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17194e2b4712SSatish Balay     while (nz--) {
17204e2b4712SSatish Balay       idx   = 4*(*vi++);
1721f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1722f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1723f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1724f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1725f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1726f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17274e2b4712SSatish Balay       v += 16;
17284e2b4712SSatish Balay     }
17294e2b4712SSatish Balay     idc      = 4*(*c--);
17304e2b4712SSatish Balay     v        = aa + 16*diag[i];
1731f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1732f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1733f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1734f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17354e2b4712SSatish Balay   }
17364e2b4712SSatish Balay 
17374e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17384e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1739e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1740e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17424e2b4712SSatish Balay   PetscFunctionReturn(0);
17434e2b4712SSatish Balay }
174424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
174524c233c2SKris Buschelman 
174624c233c2SKris Buschelman #include PETSC_HAVE_SSE
174724c233c2SKris Buschelman 
174824c233c2SKris Buschelman #undef __FUNCT__
174924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
175024c233c2SKris Buschelman int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
175124c233c2SKris Buschelman {
175224c233c2SKris Buschelman   /*
175324c233c2SKris Buschelman      Note: This code uses demotion of double
175424c233c2SKris Buschelman      to float when performing the mixed-mode computation.
175524c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
175624c233c2SKris Buschelman   */
175724c233c2SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
175824c233c2SKris Buschelman   IS              iscol=a->col,isrow=a->row;
175924c233c2SKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
176024c233c2SKris Buschelman   int             *diag = a->diag,ai16;
176124c233c2SKris Buschelman   MatScalar       *aa=a->a,*v;
176224c233c2SKris Buschelman   Scalar          *x,*b,*t;
176324c233c2SKris Buschelman 
176424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
176524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
176624c233c2SKris Buschelman   unsigned long   offset;
176724c233c2SKris Buschelman 
176824c233c2SKris Buschelman   PetscFunctionBegin;
176924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
177024c233c2SKris Buschelman 
177124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
177224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
177324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
177424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
177524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
177624c233c2SKris Buschelman 
177724c233c2SKris Buschelman     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
177824c233c2SKris Buschelman     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177924c233c2SKris Buschelman     t  = a->solve_work;
178024c233c2SKris Buschelman 
178124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
178224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
178324c233c2SKris Buschelman 
178424c233c2SKris Buschelman     /* forward solve the lower triangular */
178524c233c2SKris Buschelman     idx  = 4*(*r++);
178624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
178724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
178824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
178924c233c2SKris Buschelman 
179024c233c2SKris Buschelman     for (i=1; i<n;) {
179124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
179224c233c2SKris Buschelman       vi   =  aj      + ai[i];
179324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
179424c233c2SKris Buschelman       idx  =  4*(*r++);
179524c233c2SKris Buschelman 
179624c233c2SKris Buschelman       /* Demote sum from double to float */
179724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
179824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
179924c233c2SKris Buschelman 
180024c233c2SKris Buschelman       while (nz--) {
180124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
180224c233c2SKris Buschelman         idx = 4*(*vi++);
180324c233c2SKris Buschelman 
180424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
180524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
180624c233c2SKris Buschelman 
180724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
180824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
180924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
181024c233c2SKris Buschelman 
181124c233c2SKris Buschelman           /* First Column */
181224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
181324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
181424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
181524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
181624c233c2SKris Buschelman 
181724c233c2SKris Buschelman           /* Second Column */
181824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
181924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
182024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
182124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
182224c233c2SKris Buschelman 
182324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
182424c233c2SKris Buschelman 
182524c233c2SKris Buschelman           /* Third Column */
182624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
182724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
182824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
182924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
183024c233c2SKris Buschelman 
183124c233c2SKris Buschelman           /* Fourth Column */
183224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
183324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
183424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
183524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
183624c233c2SKris Buschelman         SSE_INLINE_END_2
183724c233c2SKris Buschelman 
183824c233c2SKris Buschelman         v  += 16;
183924c233c2SKris Buschelman       }
184024c233c2SKris Buschelman       idx = 4*i;
184124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
184224c233c2SKris Buschelman       PREFETCH_NTA(v);
184324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
184424c233c2SKris Buschelman 
184524c233c2SKris Buschelman       /* Promote result from float to double */
184624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
184724c233c2SKris Buschelman     }
184824c233c2SKris Buschelman     /* backward solve the upper triangular */
184924c233c2SKris Buschelman     idt  = 4*(n-1);
185024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
185124c233c2SKris Buschelman     v    = aa + ai16 + 16;
185224c233c2SKris Buschelman     for (i=n-1; i>=0;){
185324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
185424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
185524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
185624c233c2SKris Buschelman 
185724c233c2SKris Buschelman       /* Demote accumulator from double to float */
185824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
185924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
186024c233c2SKris Buschelman 
186124c233c2SKris Buschelman       while (nz--) {
186224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
186324c233c2SKris Buschelman         idx = 4*(*vi++);
186424c233c2SKris Buschelman 
186524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
186624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
186724c233c2SKris Buschelman 
186824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
186924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
187024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
187124c233c2SKris Buschelman 
187224c233c2SKris Buschelman           /* First Column */
187324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
187424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
187524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
187624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
187724c233c2SKris Buschelman 
187824c233c2SKris Buschelman           /* Second Column */
187924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
188024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
188124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
188224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
188324c233c2SKris Buschelman 
188424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
188524c233c2SKris Buschelman 
188624c233c2SKris Buschelman           /* Third Column */
188724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
188824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
188924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
189024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
189124c233c2SKris Buschelman 
189224c233c2SKris Buschelman           /* Fourth Column */
189324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
189424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
189524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
189624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
189724c233c2SKris Buschelman         SSE_INLINE_END_2
189824c233c2SKris Buschelman         v  += 16;
189924c233c2SKris Buschelman       }
190024c233c2SKris Buschelman       v    = aa + ai16;
190124c233c2SKris Buschelman       ai16 = 16*diag[--i];
190224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
190324c233c2SKris Buschelman       /*
190424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
190524c233c2SKris Buschelman          which was inverted as part of the factorization
190624c233c2SKris Buschelman       */
190724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
190824c233c2SKris Buschelman         /* First Column */
190924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
191024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
191124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
191224c233c2SKris Buschelman 
191324c233c2SKris Buschelman         /* Second Column */
191424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
191524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
191624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
191724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
191824c233c2SKris Buschelman 
191924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
192024c233c2SKris Buschelman 
192124c233c2SKris Buschelman         /* Third Column */
192224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
192324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
192424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
192524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
192624c233c2SKris Buschelman 
192724c233c2SKris Buschelman         /* Fourth Column */
192824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
192924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
193024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
193124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
193224c233c2SKris Buschelman 
193324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
193424c233c2SKris Buschelman       SSE_INLINE_END_3
193524c233c2SKris Buschelman 
193624c233c2SKris Buschelman       /* Promote solution from float to double */
193724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
193824c233c2SKris Buschelman 
193924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
194024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
194124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
194224c233c2SKris Buschelman       idc  = 4*(*c--);
194324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
194424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
194524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
194624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
194724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
194824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
194924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
195024c233c2SKris Buschelman       SSE_INLINE_END_2
195124c233c2SKris Buschelman       v    = aa + ai16 + 16;
195224c233c2SKris Buschelman       idt -= 4;
195324c233c2SKris Buschelman     }
195424c233c2SKris Buschelman 
195524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
195624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
195724c233c2SKris Buschelman     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
195824c233c2SKris Buschelman     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
195924c233c2SKris Buschelman     PetscLogFlops(2*16*(a->nz) - 4*A->n);
196024c233c2SKris Buschelman   SSE_SCOPE_END;
196124c233c2SKris Buschelman   PetscFunctionReturn(0);
196224c233c2SKris Buschelman }
196324c233c2SKris Buschelman 
196424c233c2SKris Buschelman #endif
19650ef38995SBarry Smith 
19660ef38995SBarry Smith 
19674e2b4712SSatish Balay /*
19684e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
19694e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
19704e2b4712SSatish Balay */
19714a2ae208SSatish Balay #undef __FUNCT__
19724a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
19734e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
19744e2b4712SSatish Balay {
19754e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
197630d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
197730d4dcafSBarry Smith   int             ierr,*diag = a->diag;
19783f1db9ecSBarry Smith   MatScalar       *aa=a->a;
197930d4dcafSBarry Smith   Scalar          *x,*b;
19804e2b4712SSatish Balay 
19814e2b4712SSatish Balay   PetscFunctionBegin;
1982e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1983e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
19844e2b4712SSatish Balay 
1985aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
19862853dc0eSBarry Smith   {
19872853dc0eSBarry Smith     static Scalar w[2000]; /* very BAD need to fix */
19882853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
19892853dc0eSBarry Smith   }
1990aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
19912853dc0eSBarry Smith   {
19922853dc0eSBarry Smith     static Scalar w[2000]; /* very BAD need to fix */
19932853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
19942853dc0eSBarry Smith   }
1995aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
19962853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
1997e1293385SBarry Smith #else
199830d4dcafSBarry Smith   {
1999f1af5d2fSBarry Smith     Scalar    s1,s2,s3,s4,x1,x2,x3,x4;
20003f1db9ecSBarry Smith     MatScalar *v;
20014e555682SBarry Smith     int       jdx,idt,idx,nz,*vi,i,ai16;
2002e1293385SBarry Smith 
20034e2b4712SSatish Balay   /* forward solve the lower triangular */
20044e2b4712SSatish Balay   idx    = 0;
2005e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
20064e2b4712SSatish Balay   for (i=1; i<n; i++) {
20074e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
20084e2b4712SSatish Balay     vi    =  aj      + ai[i];
20094e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2010e1293385SBarry Smith     idx   +=  4;
2011f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
20124e2b4712SSatish Balay     while (nz--) {
20134e2b4712SSatish Balay       jdx   = 4*(*vi++);
20144e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2015f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2016f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2017f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2018f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
20194e2b4712SSatish Balay       v    += 16;
20204e2b4712SSatish Balay     }
2021f1af5d2fSBarry Smith     x[idx]   = s1;
2022f1af5d2fSBarry Smith     x[1+idx] = s2;
2023f1af5d2fSBarry Smith     x[2+idx] = s3;
2024f1af5d2fSBarry Smith     x[3+idx] = s4;
20254e2b4712SSatish Balay   }
20264e2b4712SSatish Balay   /* backward solve the upper triangular */
20274e555682SBarry Smith   idt = 4*(n-1);
20284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20294e555682SBarry Smith     ai16 = 16*diag[i];
20304e555682SBarry Smith     v    = aa + ai16 + 16;
20314e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
20324e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2033f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2034f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
20354e2b4712SSatish Balay     while (nz--) {
20364e2b4712SSatish Balay       idx   = 4*(*vi++);
20374e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2038f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2039f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2040f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2041f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
20424e2b4712SSatish Balay       v    += 16;
20434e2b4712SSatish Balay     }
20444e555682SBarry Smith     v        = aa + ai16;
2045f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2046f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2047f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2048f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2049329f5518SBarry Smith     idt -= 4;
20504e2b4712SSatish Balay   }
205130d4dcafSBarry Smith   }
2052e1293385SBarry Smith #endif
20534e2b4712SSatish Balay 
2054e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2055e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2056b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
20574e2b4712SSatish Balay   PetscFunctionReturn(0);
20584e2b4712SSatish Balay }
20594e2b4712SSatish Balay 
20603660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
20613660e330SKris Buschelman 
20623660e330SKris Buschelman #include PETSC_HAVE_SSE
20633660e330SKris Buschelman 
20643660e330SKris Buschelman #undef __FUNCT__
20653660e330SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
20663660e330SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
20673660e330SKris Buschelman {
20683660e330SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
20693660e330SKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
20703660e330SKris Buschelman   int             ierr,*diag = a->diag;
20713660e330SKris Buschelman   MatScalar       *aa=a->a;
20723660e330SKris Buschelman   Scalar          *x,*b;
20733660e330SKris Buschelman 
20743660e330SKris Buschelman   PetscFunctionBegin;
20753660e330SKris Buschelman   SSE_SCOPE_BEGIN;
20763660e330SKris Buschelman   /*
20773660e330SKris Buschelman      Note: This code currently uses demotion of double
20783660e330SKris Buschelman      to float when performing the mixed-mode computation.
20793660e330SKris Buschelman      This may not be numerically reasonable for all applications.
20803660e330SKris Buschelman   */
20813660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
20823660e330SKris Buschelman 
20833660e330SKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
20843660e330SKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20853660e330SKris Buschelman   {
20863660e330SKris Buschelman     MatScalar     *v;
20873660e330SKris Buschelman     int           jdx,idt,idx,nz,*vi,i,ai16;
20883660e330SKris Buschelman 
20893660e330SKris Buschelman     /* Make space in temp stack for 16 Byte Aligned arrays */
20903660e330SKris Buschelman     float         ssealignedspace[11],*tmps,*tmpx;
20913660e330SKris Buschelman     unsigned long offset = (unsigned long)ssealignedspace % 16;
20923660e330SKris Buschelman     if (offset) offset = (16 - offset)/4;
20933660e330SKris Buschelman     tmps = &ssealignedspace[offset];
20943660e330SKris Buschelman     tmpx = &ssealignedspace[offset+4];
20953660e330SKris Buschelman 
20963660e330SKris Buschelman   /* forward solve the lower triangular */
20973660e330SKris Buschelman     idx  = 0;
20983660e330SKris Buschelman     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
20993660e330SKris Buschelman     v    =  aa + 16*ai[1];
21003660e330SKris Buschelman 
21013660e330SKris Buschelman     for (i=1; i<n;) {
21023660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
21033660e330SKris Buschelman       vi   =  aj      + ai[i];
21043660e330SKris Buschelman       nz   =  diag[i] - ai[i];
21053660e330SKris Buschelman       idx +=  4;
21063660e330SKris Buschelman 
21073660e330SKris Buschelman     /* Demote sum from double to float */
21083660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
21093660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
21103660e330SKris Buschelman 
21113660e330SKris Buschelman       while (nz--) {
21123660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
21133660e330SKris Buschelman         jdx = 4*(*vi++);
21143660e330SKris Buschelman 
21153660e330SKris Buschelman         /* Demote solution (so far) from double to float */
21163660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
21173660e330SKris Buschelman 
21183660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
21193660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
21203660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
21213660e330SKris Buschelman 
21223660e330SKris Buschelman           /* First Column */
21233660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
21243660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
21253660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
21263660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
21273660e330SKris Buschelman 
21283660e330SKris Buschelman           /* Second Column */
21293660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
21303660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
21313660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
21323660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
21333660e330SKris Buschelman 
21343660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
21353660e330SKris Buschelman 
21363660e330SKris Buschelman           /* Third Column */
21373660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
21383660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
21393660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
21403660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
21413660e330SKris Buschelman 
21423660e330SKris Buschelman           /* Fourth Column */
21433660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
21443660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
21453660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
21463660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
21473660e330SKris Buschelman         SSE_INLINE_END_2
21483660e330SKris Buschelman 
21493660e330SKris Buschelman         v  += 16;
21503660e330SKris Buschelman       }
21513660e330SKris Buschelman       v    =  aa + 16*ai[++i];
21523660e330SKris Buschelman       PREFETCH_NTA(v);
21533660e330SKris Buschelman       STORE_PS(tmps,XMM7);
21543660e330SKris Buschelman 
21553660e330SKris Buschelman       /* Promote result from float to double */
21563660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
21573660e330SKris Buschelman     }
21583660e330SKris Buschelman     /* backward solve the upper triangular */
21593660e330SKris Buschelman     idt  = 4*(n-1);
21603660e330SKris Buschelman     ai16 = 16*diag[n-1];
21613660e330SKris Buschelman     v    = aa + ai16 + 16;
21623660e330SKris Buschelman     for (i=n-1; i>=0;){
21633660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
21643660e330SKris Buschelman       vi = aj + diag[i] + 1;
21653660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
21663660e330SKris Buschelman 
21673660e330SKris Buschelman       /* Demote accumulator from double to float */
21683660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
21693660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
21703660e330SKris Buschelman 
21713660e330SKris Buschelman       while (nz--) {
21723660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
21733660e330SKris Buschelman         idx = 4*(*vi++);
21743660e330SKris Buschelman 
21753660e330SKris Buschelman         /* Demote solution (so far) from double to float */
21763660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
21773660e330SKris Buschelman 
21783660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
21793660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
21803660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
21813660e330SKris Buschelman 
21823660e330SKris Buschelman           /* First Column */
21833660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
21843660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
21853660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
21863660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
21873660e330SKris Buschelman 
21883660e330SKris Buschelman           /* Second Column */
21893660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
21903660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
21913660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
21923660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
21933660e330SKris Buschelman 
21943660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
21953660e330SKris Buschelman 
21963660e330SKris Buschelman           /* Third Column */
21973660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
21983660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
21993660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
22003660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
22013660e330SKris Buschelman 
22023660e330SKris Buschelman           /* Fourth Column */
22033660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
22043660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
22053660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
22063660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
22073660e330SKris Buschelman         SSE_INLINE_END_2
22083660e330SKris Buschelman         v  += 16;
22093660e330SKris Buschelman       }
22103660e330SKris Buschelman       v    = aa + ai16;
22113660e330SKris Buschelman       ai16 = 16*diag[--i];
22123660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
22133660e330SKris Buschelman       /*
22143660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
22153660e330SKris Buschelman          which was inverted as part of the factorization
22163660e330SKris Buschelman       */
22173660e330SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
22183660e330SKris Buschelman         /* First Column */
22193660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
22203660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
22213660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
22223660e330SKris Buschelman 
22233660e330SKris Buschelman         /* Second Column */
22243660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
22253660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
22263660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
22273660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
22283660e330SKris Buschelman 
22293660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
22303660e330SKris Buschelman 
22313660e330SKris Buschelman         /* Third Column */
22323660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
22333660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
22343660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
22353660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
22363660e330SKris Buschelman 
22373660e330SKris Buschelman         /* Fourth Column */
22383660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
22393660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
22403660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
22413660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
22423660e330SKris Buschelman 
22433660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
22443660e330SKris Buschelman       SSE_INLINE_END_3
22453660e330SKris Buschelman 
22463660e330SKris Buschelman       /* Promote solution from float to double */
22473660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
22483660e330SKris Buschelman 
22493660e330SKris Buschelman       v    = aa + ai16 + 16;
22503660e330SKris Buschelman       idt -= 4;
22513660e330SKris Buschelman     }
22523660e330SKris Buschelman   }
22533660e330SKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22543660e330SKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22553660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
22563660e330SKris Buschelman   SSE_SCOPE_END;
22573660e330SKris Buschelman   PetscFunctionReturn(0);
22583660e330SKris Buschelman }
22593660e330SKris Buschelman 
22603660e330SKris Buschelman #endif
22614a2ae208SSatish Balay #undef __FUNCT__
22624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
22634e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
22644e2b4712SSatish Balay {
22654e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
22664e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
22674e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
22684e2b4712SSatish Balay   int             *diag = a->diag;
22693f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
2270f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,x1,x2,x3,*t;
22714e2b4712SSatish Balay 
22724e2b4712SSatish Balay   PetscFunctionBegin;
2273e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2274e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2275f1af5d2fSBarry Smith   t  = a->solve_work;
22764e2b4712SSatish Balay 
22774e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22784e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
22794e2b4712SSatish Balay 
22804e2b4712SSatish Balay   /* forward solve the lower triangular */
22814e2b4712SSatish Balay   idx    = 3*(*r++);
2282f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
22834e2b4712SSatish Balay   for (i=1; i<n; i++) {
22844e2b4712SSatish Balay     v     = aa + 9*ai[i];
22854e2b4712SSatish Balay     vi    = aj + ai[i];
22864e2b4712SSatish Balay     nz    = diag[i] - ai[i];
22874e2b4712SSatish Balay     idx   = 3*(*r++);
2288f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
22894e2b4712SSatish Balay     while (nz--) {
22904e2b4712SSatish Balay       idx   = 3*(*vi++);
2291f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2292f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2293f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2294f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
22954e2b4712SSatish Balay       v += 9;
22964e2b4712SSatish Balay     }
22974e2b4712SSatish Balay     idx = 3*i;
2298f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
22994e2b4712SSatish Balay   }
23004e2b4712SSatish Balay   /* backward solve the upper triangular */
23014e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
23024e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
23034e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
23044e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
23054e2b4712SSatish Balay     idt  = 3*i;
2306f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
23074e2b4712SSatish Balay     while (nz--) {
23084e2b4712SSatish Balay       idx   = 3*(*vi++);
2309f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2310f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2311f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2312f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
23134e2b4712SSatish Balay       v += 9;
23144e2b4712SSatish Balay     }
23154e2b4712SSatish Balay     idc = 3*(*c--);
23164e2b4712SSatish Balay     v   = aa + 9*diag[i];
2317f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2318f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2319f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
23204e2b4712SSatish Balay   }
23214e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23224e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2323e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2324e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2325b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
23264e2b4712SSatish Balay   PetscFunctionReturn(0);
23274e2b4712SSatish Balay }
23284e2b4712SSatish Balay 
232915091d37SBarry Smith /*
233015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
233115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
233215091d37SBarry Smith */
23334a2ae208SSatish Balay #undef __FUNCT__
23344a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
233515091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
233615091d37SBarry Smith {
233715091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
233815091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
233915091d37SBarry Smith   int             ierr,*diag = a->diag;
234015091d37SBarry Smith   MatScalar       *aa=a->a,*v;
2341f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,s3,x1,x2,x3;
234215091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
234315091d37SBarry Smith 
234415091d37SBarry Smith   PetscFunctionBegin;
234515091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
234615091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
234715091d37SBarry Smith 
234815091d37SBarry Smith 
234915091d37SBarry Smith   /* forward solve the lower triangular */
235015091d37SBarry Smith   idx    = 0;
235115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
235215091d37SBarry Smith   for (i=1; i<n; i++) {
235315091d37SBarry Smith     v     =  aa      + 9*ai[i];
235415091d37SBarry Smith     vi    =  aj      + ai[i];
235515091d37SBarry Smith     nz    =  diag[i] - ai[i];
235615091d37SBarry Smith     idx   +=  3;
2357f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
235815091d37SBarry Smith     while (nz--) {
235915091d37SBarry Smith       jdx   = 3*(*vi++);
236015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2361f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2362f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2363f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
236415091d37SBarry Smith       v    += 9;
236515091d37SBarry Smith     }
2366f1af5d2fSBarry Smith     x[idx]   = s1;
2367f1af5d2fSBarry Smith     x[1+idx] = s2;
2368f1af5d2fSBarry Smith     x[2+idx] = s3;
236915091d37SBarry Smith   }
237015091d37SBarry Smith   /* backward solve the upper triangular */
237115091d37SBarry Smith   for (i=n-1; i>=0; i--){
237215091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
237315091d37SBarry Smith     vi   = aj + diag[i] + 1;
237415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
237515091d37SBarry Smith     idt  = 3*i;
2376f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2377f1af5d2fSBarry Smith     s3 = x[2+idt];
237815091d37SBarry Smith     while (nz--) {
237915091d37SBarry Smith       idx   = 3*(*vi++);
238015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2381f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2382f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2383f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
238415091d37SBarry Smith       v    += 9;
238515091d37SBarry Smith     }
238615091d37SBarry Smith     v        = aa +  9*diag[i];
2387f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2388f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2389f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
239015091d37SBarry Smith   }
239115091d37SBarry Smith 
239215091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
239315091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2394b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
239515091d37SBarry Smith   PetscFunctionReturn(0);
239615091d37SBarry Smith }
239715091d37SBarry Smith 
23984a2ae208SSatish Balay #undef __FUNCT__
23994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
24004e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
24014e2b4712SSatish Balay {
24024e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
24034e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
24044e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
24054e2b4712SSatish Balay   int             *diag = a->diag;
24063f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
2407f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,x1,x2,*t;
24084e2b4712SSatish Balay 
24094e2b4712SSatish Balay   PetscFunctionBegin;
2410e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2411e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2412f1af5d2fSBarry Smith   t  = a->solve_work;
24134e2b4712SSatish Balay 
24144e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
24154e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
24164e2b4712SSatish Balay 
24174e2b4712SSatish Balay   /* forward solve the lower triangular */
24184e2b4712SSatish Balay   idx    = 2*(*r++);
2419f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
24204e2b4712SSatish Balay   for (i=1; i<n; i++) {
24214e2b4712SSatish Balay     v     = aa + 4*ai[i];
24224e2b4712SSatish Balay     vi    = aj + ai[i];
24234e2b4712SSatish Balay     nz    = diag[i] - ai[i];
24244e2b4712SSatish Balay     idx   = 2*(*r++);
2425f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
24264e2b4712SSatish Balay     while (nz--) {
24274e2b4712SSatish Balay       idx   = 2*(*vi++);
2428f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2429f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2430f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
24314e2b4712SSatish Balay       v += 4;
24324e2b4712SSatish Balay     }
24334e2b4712SSatish Balay     idx = 2*i;
2434f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
24354e2b4712SSatish Balay   }
24364e2b4712SSatish Balay   /* backward solve the upper triangular */
24374e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24384e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
24394e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24404e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
24414e2b4712SSatish Balay     idt  = 2*i;
2442f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
24434e2b4712SSatish Balay     while (nz--) {
24444e2b4712SSatish Balay       idx   = 2*(*vi++);
2445f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2446f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2447f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
24484e2b4712SSatish Balay       v += 4;
24494e2b4712SSatish Balay     }
24504e2b4712SSatish Balay     idc = 2*(*c--);
24514e2b4712SSatish Balay     v   = aa + 4*diag[i];
2452f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2453f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
24544e2b4712SSatish Balay   }
24554e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
24564e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2457e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2458e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2459b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
24604e2b4712SSatish Balay   PetscFunctionReturn(0);
24614e2b4712SSatish Balay }
24624e2b4712SSatish Balay 
246315091d37SBarry Smith /*
246415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
246515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
246615091d37SBarry Smith */
24674a2ae208SSatish Balay #undef __FUNCT__
24684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
246915091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
247015091d37SBarry Smith {
247115091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
247215091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
247315091d37SBarry Smith   int             ierr,*diag = a->diag;
247415091d37SBarry Smith   MatScalar       *aa=a->a,*v;
2475f1af5d2fSBarry Smith   Scalar          *x,*b,s1,s2,x1,x2;
247615091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
247715091d37SBarry Smith 
247815091d37SBarry Smith   PetscFunctionBegin;
247915091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
248015091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248115091d37SBarry Smith 
248215091d37SBarry Smith   /* forward solve the lower triangular */
248315091d37SBarry Smith   idx    = 0;
248415091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
248515091d37SBarry Smith   for (i=1; i<n; i++) {
248615091d37SBarry Smith     v     =  aa      + 4*ai[i];
248715091d37SBarry Smith     vi    =  aj      + ai[i];
248815091d37SBarry Smith     nz    =  diag[i] - ai[i];
248915091d37SBarry Smith     idx   +=  2;
2490f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
249115091d37SBarry Smith     while (nz--) {
249215091d37SBarry Smith       jdx   = 2*(*vi++);
249315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2494f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2495f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
249615091d37SBarry Smith       v    += 4;
249715091d37SBarry Smith     }
2498f1af5d2fSBarry Smith     x[idx]   = s1;
2499f1af5d2fSBarry Smith     x[1+idx] = s2;
250015091d37SBarry Smith   }
250115091d37SBarry Smith   /* backward solve the upper triangular */
250215091d37SBarry Smith   for (i=n-1; i>=0; i--){
250315091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
250415091d37SBarry Smith     vi   = aj + diag[i] + 1;
250515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
250615091d37SBarry Smith     idt  = 2*i;
2507f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
250815091d37SBarry Smith     while (nz--) {
250915091d37SBarry Smith       idx   = 2*(*vi++);
251015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2511f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2512f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
251315091d37SBarry Smith       v    += 4;
251415091d37SBarry Smith     }
251515091d37SBarry Smith     v        = aa +  4*diag[i];
2516f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2517f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
251815091d37SBarry Smith   }
251915091d37SBarry Smith 
252015091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
252115091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2522b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
252315091d37SBarry Smith   PetscFunctionReturn(0);
252415091d37SBarry Smith }
252515091d37SBarry Smith 
25264a2ae208SSatish Balay #undef __FUNCT__
25274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
25284e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
25294e2b4712SSatish Balay {
25304e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
25314e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
25324e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
25334e2b4712SSatish Balay   int             *diag = a->diag;
25343f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
2535f1af5d2fSBarry Smith   Scalar          *x,*b,s1,*t;
25364e2b4712SSatish Balay 
25374e2b4712SSatish Balay   PetscFunctionBegin;
25384e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
25394e2b4712SSatish Balay 
2540e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2541e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2542f1af5d2fSBarry Smith   t  = a->solve_work;
25434e2b4712SSatish Balay 
25444e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25454e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25464e2b4712SSatish Balay 
25474e2b4712SSatish Balay   /* forward solve the lower triangular */
2548f1af5d2fSBarry Smith   t[0] = b[*r++];
25494e2b4712SSatish Balay   for (i=1; i<n; i++) {
25504e2b4712SSatish Balay     v     = aa + ai[i];
25514e2b4712SSatish Balay     vi    = aj + ai[i];
25524e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2553f1af5d2fSBarry Smith     s1  = b[*r++];
25544e2b4712SSatish Balay     while (nz--) {
2555f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
25564e2b4712SSatish Balay     }
2557f1af5d2fSBarry Smith     t[i] = s1;
25584e2b4712SSatish Balay   }
25594e2b4712SSatish Balay   /* backward solve the upper triangular */
25604e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
25614e2b4712SSatish Balay     v    = aa + diag[i] + 1;
25624e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
25634e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2564f1af5d2fSBarry Smith     s1 = t[i];
25654e2b4712SSatish Balay     while (nz--) {
2566f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
25674e2b4712SSatish Balay     }
2568f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
25694e2b4712SSatish Balay   }
25704e2b4712SSatish Balay 
25714e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25724e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2573e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2574e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2575b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
25764e2b4712SSatish Balay   PetscFunctionReturn(0);
25774e2b4712SSatish Balay }
257815091d37SBarry Smith /*
257915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
258015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
258115091d37SBarry Smith */
25824a2ae208SSatish Balay #undef __FUNCT__
25834a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
258415091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
258515091d37SBarry Smith {
258615091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
258715091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
258815091d37SBarry Smith   int             ierr,*diag = a->diag;
258915091d37SBarry Smith   MatScalar       *aa=a->a;
259015091d37SBarry Smith   Scalar          *x,*b;
2591f1af5d2fSBarry Smith   Scalar          s1,x1;
259215091d37SBarry Smith   MatScalar       *v;
259315091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
259415091d37SBarry Smith 
259515091d37SBarry Smith   PetscFunctionBegin;
259615091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
259715091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
259815091d37SBarry Smith 
259915091d37SBarry Smith   /* forward solve the lower triangular */
260015091d37SBarry Smith   idx    = 0;
260115091d37SBarry Smith   x[0]   = b[0];
260215091d37SBarry Smith   for (i=1; i<n; i++) {
260315091d37SBarry Smith     v     =  aa      + ai[i];
260415091d37SBarry Smith     vi    =  aj      + ai[i];
260515091d37SBarry Smith     nz    =  diag[i] - ai[i];
260615091d37SBarry Smith     idx   +=  1;
2607f1af5d2fSBarry Smith     s1  =  b[idx];
260815091d37SBarry Smith     while (nz--) {
260915091d37SBarry Smith       jdx   = *vi++;
261015091d37SBarry Smith       x1    = x[jdx];
2611f1af5d2fSBarry Smith       s1 -= v[0]*x1;
261215091d37SBarry Smith       v    += 1;
261315091d37SBarry Smith     }
2614f1af5d2fSBarry Smith     x[idx]   = s1;
261515091d37SBarry Smith   }
261615091d37SBarry Smith   /* backward solve the upper triangular */
261715091d37SBarry Smith   for (i=n-1; i>=0; i--){
261815091d37SBarry Smith     v    = aa + diag[i] + 1;
261915091d37SBarry Smith     vi   = aj + diag[i] + 1;
262015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
262115091d37SBarry Smith     idt  = i;
2622f1af5d2fSBarry Smith     s1 = x[idt];
262315091d37SBarry Smith     while (nz--) {
262415091d37SBarry Smith       idx   = *vi++;
262515091d37SBarry Smith       x1    = x[idx];
2626f1af5d2fSBarry Smith       s1 -= v[0]*x1;
262715091d37SBarry Smith       v    += 1;
262815091d37SBarry Smith     }
262915091d37SBarry Smith     v        = aa +  diag[i];
2630f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
263115091d37SBarry Smith   }
263215091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
263315091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2634b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
263515091d37SBarry Smith   PetscFunctionReturn(0);
263615091d37SBarry Smith }
26374e2b4712SSatish Balay 
26384e2b4712SSatish Balay /* ----------------------------------------------------------------*/
26394e2b4712SSatish Balay /*
26404e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
26414e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
26424e2b4712SSatish Balay    Not a good example of code reuse.
26434e2b4712SSatish Balay */
2644ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2645435faa5fSBarry Smith 
26464a2ae208SSatish Balay #undef __FUNCT__
26474a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2648435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
26494e2b4712SSatish Balay {
26504e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
26514e2b4712SSatish Balay   IS          isicol;
26524e2b4712SSatish Balay   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
26534e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2654335d9088SBarry Smith   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2655435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
26564533b203SBarry Smith   PetscTruth  col_identity,row_identity;
2657329f5518SBarry Smith   PetscReal   f;
26584e2b4712SSatish Balay 
26594e2b4712SSatish Balay   PetscFunctionBegin;
2660435faa5fSBarry Smith   if (info) {
2661435faa5fSBarry Smith     f             = info->fill;
2662335d9088SBarry Smith     levels        = (int)info->levels;
2663335d9088SBarry Smith     diagonal_fill = (int)info->diagonal_fill;
2664435faa5fSBarry Smith   } else {
2665435faa5fSBarry Smith     f             = 1.0;
2666435faa5fSBarry Smith     levels        = 0;
2667435faa5fSBarry Smith     diagonal_fill = 0;
2668435faa5fSBarry Smith   }
26694c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2670667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2671667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2672309c388cSBarry Smith 
2673309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2674bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2675bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
2676bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
2677bb3d539aSBarry Smith     if (!b->diag) {
2678bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2679bb3d539aSBarry Smith     }
2680bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2681bb3d539aSBarry Smith     b->row        = isrow;
2682bb3d539aSBarry Smith     b->col        = iscol;
2683bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2684bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2685bb3d539aSBarry Smith     b->icol       = isicol;
2686b0a32e0cSBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr);
2687309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
26884e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
26894e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
26904e2b4712SSatish Balay 
26914e2b4712SSatish Balay     /* get new row pointers */
2692b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
26934e2b4712SSatish Balay     ainew[0] = 0;
26944e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
26954e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
269682502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
26974e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
269882502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
26994e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
2700b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
27014e2b4712SSatish Balay     /* im is level for each filled value */
2702b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
27034e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
2704b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
27054e2b4712SSatish Balay     dloc[0]  = 0;
27064e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
2707435faa5fSBarry Smith 
2708435faa5fSBarry Smith       /* copy prow into linked list */
27094e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
271029bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
27114e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
27124e2b4712SSatish Balay       fill[n]    = n;
2713435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
27144e2b4712SSatish Balay       while (nz--) {
27154e2b4712SSatish Balay 	fm  = n;
27164e2b4712SSatish Balay 	idx = ic[*xi++];
27174e2b4712SSatish Balay 	do {
27184e2b4712SSatish Balay 	  m  = fm;
27194e2b4712SSatish Balay 	  fm = fill[m];
27204e2b4712SSatish Balay 	} while (fm < idx);
27214e2b4712SSatish Balay 	fill[m]   = idx;
27224e2b4712SSatish Balay 	fill[idx] = fm;
27234e2b4712SSatish Balay 	im[idx]   = 0;
27244e2b4712SSatish Balay       }
2725435faa5fSBarry Smith 
2726435faa5fSBarry Smith       /* make sure diagonal entry is included */
2727435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
2728435faa5fSBarry Smith 	fm = n;
2729435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
2730435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2731435faa5fSBarry Smith 	fill[fm]   = prow;
2732435faa5fSBarry Smith 	im[prow]   = 0;
2733435faa5fSBarry Smith 	nzf++;
2734335d9088SBarry Smith 	dcount++;
2735435faa5fSBarry Smith       }
2736435faa5fSBarry Smith 
27374e2b4712SSatish Balay       nzi = 0;
27384e2b4712SSatish Balay       row = fill[n];
27394e2b4712SSatish Balay       while (row < prow) {
27404e2b4712SSatish Balay 	incrlev = im[row] + 1;
27414e2b4712SSatish Balay 	nz      = dloc[row];
2742435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
27434e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
27444e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
27454e2b4712SSatish Balay 	fm      = row;
27464e2b4712SSatish Balay 	while (nnz-- > 0) {
27474e2b4712SSatish Balay 	  idx = *xi++;
27484e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
27494e2b4712SSatish Balay 	    flev++;
27504e2b4712SSatish Balay 	    continue;
27514e2b4712SSatish Balay 	  }
27524e2b4712SSatish Balay 	  do {
27534e2b4712SSatish Balay 	    m  = fm;
27544e2b4712SSatish Balay 	    fm = fill[m];
27554e2b4712SSatish Balay 	  } while (fm < idx);
27564e2b4712SSatish Balay 	  if (fm != idx) {
27574e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
27584e2b4712SSatish Balay 	    fill[m]   = idx;
27594e2b4712SSatish Balay 	    fill[idx] = fm;
27604e2b4712SSatish Balay 	    fm        = idx;
27614e2b4712SSatish Balay 	    nzf++;
2762ecf371e4SBarry Smith 	  } else {
27634e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
27644e2b4712SSatish Balay 	  }
27654e2b4712SSatish Balay 	  flev++;
27664e2b4712SSatish Balay 	}
27674e2b4712SSatish Balay 	row = fill[row];
27684e2b4712SSatish Balay 	nzi++;
27694e2b4712SSatish Balay       }
27704e2b4712SSatish Balay       /* copy new filled row into permanent storage */
27714e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
27724e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
2773ecf371e4SBarry Smith 
2774ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
2775ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2776ecf371e4SBarry Smith 	/* just double the memory each time */
2777ecf371e4SBarry Smith 	int maxadd = jmax;
2778ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
27794e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
27804e2b4712SSatish Balay 	jmax += maxadd;
2781ecf371e4SBarry Smith 
2782ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
278382502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2784549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2785606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
27864e2b4712SSatish Balay 	ajnew = xi;
278782502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2788549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2789606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
27904e2b4712SSatish Balay 	ajfill = xi;
2791ecf371e4SBarry Smith 	realloc++; /* count how many reallocations are needed */
27924e2b4712SSatish Balay       }
27934e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
27944e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
27954e2b4712SSatish Balay       dloc[prow]  = nzi;
27964e2b4712SSatish Balay       fm          = fill[n];
27974e2b4712SSatish Balay       while (nzf--) {
27984e2b4712SSatish Balay 	*xi++   = fm;
27994e2b4712SSatish Balay 	*flev++ = im[fm];
28004e2b4712SSatish Balay 	fm      = fill[fm];
28014e2b4712SSatish Balay       }
2802435faa5fSBarry Smith       /* make sure row has diagonal entry */
2803435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
280429bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2805435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2806435faa5fSBarry Smith       }
28074e2b4712SSatish Balay     }
2808606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
28094e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
28104e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2811606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
2812606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
28134e2b4712SSatish Balay 
28144e2b4712SSatish Balay     {
2815329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2816b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2817b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2818b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2819b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2820335d9088SBarry Smith       if (diagonal_fill) {
2821b0a32e0cSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2822335d9088SBarry Smith       }
28234e2b4712SSatish Balay     }
28244e2b4712SSatish Balay 
28254e2b4712SSatish Balay     /* put together the new matrix */
28264e2b4712SSatish Balay     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2827b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
28284e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
2829606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
28307c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
28313f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
28324e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
2833606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
2834606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
283582502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
28364e2b4712SSatish Balay     b->j          = ajnew;
28374e2b4712SSatish Balay     b->i          = ainew;
28384e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
28394e2b4712SSatish Balay     b->diag       = dloc;
28404e2b4712SSatish Balay     b->ilen       = 0;
28414e2b4712SSatish Balay     b->imax       = 0;
28424e2b4712SSatish Balay     b->row        = isrow;
28434e2b4712SSatish Balay     b->col        = iscol;
2844c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2845c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2846e51c0b9cSSatish Balay     b->icol       = isicol;
2847b0a32e0cSBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(Scalar),&b->solve_work);CHKERRQ(ierr);
28484e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
28494e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
2850b0a32e0cSBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(Scalar));
28514e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
28524e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
28534e2b4712SSatish Balay 
28544e2b4712SSatish Balay     (*fact)->info.factor_mallocs    = realloc;
28554e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
2856329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2857309c388cSBarry Smith   }
28584e2b4712SSatish Balay 
2859309c388cSBarry Smith   if (row_identity && col_identity) {
2860732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
28618661488fSKris Buschelman   }
28628661488fSKris Buschelman   PetscFunctionReturn(0);
28638661488fSKris Buschelman }
28648661488fSKris Buschelman 
2865732ee342SKris Buschelman #undef __FUNCT__
2866732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
2867732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
28688661488fSKris Buschelman {
28698661488fSKris Buschelman   /*
28708661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
28718661488fSKris Buschelman       with natural ordering
28728661488fSKris Buschelman   */
28738661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
2874732ee342SKris Buschelman   PetscTruth  sse_enabled;
2875732ee342SKris Buschelman   int         ierr;
28768661488fSKris Buschelman 
28778661488fSKris Buschelman   PetscFunctionBegin;
28788661488fSKris Buschelman   switch (a->bs) {
28798661488fSKris Buschelman   case 1:
28808661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2881732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
2882732ee342SKris Buschelman     break;
2883309c388cSBarry Smith   case 2:
28848661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2885732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
2886309c388cSBarry Smith     break;
2887309c388cSBarry Smith   case 3:
28888661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2889732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
2890309c388cSBarry Smith     break;
2891309c388cSBarry Smith   case 4:
289283b3e5c4SKris Buschelman     ierr = PetscSSEIsEnabled(&sse_enabled);CHKERRQ(ierr);
289383b3e5c4SKris Buschelman     if (sse_enabled) {
2894*b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
28958661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2896732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
2897*b988c221SKris Buschelman #else
2898*b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2899*b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2900*b988c221SKris Buschelman #endif
29013ba47ebaSKris Buschelman     } else {
29028661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2903732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
29043ba47ebaSKris Buschelman     }
2905309c388cSBarry Smith     break;
2906309c388cSBarry Smith   case 5:
29078661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2908732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
2909309c388cSBarry Smith     break;
2910309c388cSBarry Smith   case 6:
29118661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2912732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
2913309c388cSBarry Smith     break;
2914309c388cSBarry Smith   case 7:
29158661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2916732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
2917309c388cSBarry Smith     break;
2918309c388cSBarry Smith   }
29194e2b4712SSatish Balay   PetscFunctionReturn(0);
29204e2b4712SSatish Balay }
2921732ee342SKris Buschelman 
2922732ee342SKris Buschelman #undef __FUNCT__
2923732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
2924732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A)
2925732ee342SKris Buschelman {
2926732ee342SKris Buschelman   /*
2927732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2928732ee342SKris Buschelman       with natural ordering
2929732ee342SKris Buschelman   */
2930732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2931732ee342SKris Buschelman   IS          row = a->row, col = a->col;
2932732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
2933732ee342SKris Buschelman   PetscTruth  sse_enabled,use_single,use_natural;
2934732ee342SKris Buschelman   int         ierr;
2935732ee342SKris Buschelman 
2936732ee342SKris Buschelman   PetscFunctionBegin;
2937732ee342SKris Buschelman   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
2938732ee342SKris Buschelman   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
2939732ee342SKris Buschelman 
2940732ee342SKris Buschelman   if (row_identity && col_identity) {
2941732ee342SKris Buschelman     use_natural = PETSC_TRUE;
2942732ee342SKris Buschelman   } else {
2943732ee342SKris Buschelman     use_natural = PETSC_FALSE;
2944732ee342SKris Buschelman   }
2945732ee342SKris Buschelman   switch (a->bs) {
2946732ee342SKris Buschelman   case 1:
2947732ee342SKris Buschelman     if (use_natural) {
2948732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
2949732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2950732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
2951732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2952732ee342SKris Buschelman     } else {
2953732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
2954732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
2955732ee342SKris Buschelman     }
2956732ee342SKris Buschelman     break;
2957732ee342SKris Buschelman   case 2:
2958732ee342SKris Buschelman     if (use_natural) {
2959732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2960732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2961732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
2962732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2963732ee342SKris Buschelman     } else {
2964732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
2965732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
2966732ee342SKris Buschelman     }
2967732ee342SKris Buschelman     break;
2968732ee342SKris Buschelman   case 3:
2969732ee342SKris Buschelman     if (use_natural) {
2970732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2971732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2972732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
2973732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2974732ee342SKris Buschelman     } else {
2975732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
2976732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
2977732ee342SKris Buschelman     }
2978732ee342SKris Buschelman     break;
2979732ee342SKris Buschelman   case 4:
2980732ee342SKris Buschelman     ierr = PetscSSEIsEnabled(&sse_enabled);CHKERRQ(ierr);
2981732ee342SKris Buschelman     if (sse_enabled) {
2982732ee342SKris Buschelman       if (a->single_precision_solves) {
2983732ee342SKris Buschelman         use_single = PETSC_TRUE;
2984732ee342SKris Buschelman       }
2985732ee342SKris Buschelman     } else {
2986732ee342SKris Buschelman       use_single = PETSC_FALSE;
2987732ee342SKris Buschelman     }
2988732ee342SKris Buschelman     if (use_natural) {
2989732ee342SKris Buschelman       if (use_single) {
2990*b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
2991732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
2992732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
2993*b988c221SKris Buschelman #else
2994*b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2995*b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2996*b988c221SKris Buschelman #endif
2997732ee342SKris Buschelman       } else {
2998732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
2999732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3000732ee342SKris Buschelman       }
3001732ee342SKris Buschelman       A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3002732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3003732ee342SKris Buschelman     } else {
3004732ee342SKris Buschelman       if (use_single) {
3005*b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
3006732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_SSE_Demotion;
3007732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3008*b988c221SKris Buschelman #else
3009*b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3010*b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3011*b988c221SKris Buschelman #endif
3012732ee342SKris Buschelman       } else {
3013732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
3014732ee342SKris Buschelman       }
3015732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3016732ee342SKris Buschelman     }
3017732ee342SKris Buschelman     break;
3018732ee342SKris Buschelman   case 5:
3019732ee342SKris Buschelman     if (use_natural) {
3020732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3021732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3022732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3023732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3024732ee342SKris Buschelman     } else {
3025732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3026732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3027732ee342SKris Buschelman     }
3028732ee342SKris Buschelman     break;
3029732ee342SKris Buschelman   case 6:
3030732ee342SKris Buschelman     if (use_natural) {
3031732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3032732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3033732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3034732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3035732ee342SKris Buschelman     } else {
3036732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3037732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3038732ee342SKris Buschelman     }
3039732ee342SKris Buschelman     break;
3040732ee342SKris Buschelman   case 7:
3041732ee342SKris Buschelman     if (use_natural) {
3042732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3043732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3044732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3045732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3046732ee342SKris Buschelman     } else {
3047732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3048732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3049732ee342SKris Buschelman     }
3050732ee342SKris Buschelman     break;
305131801e53SKris Buschelman   default:
305231801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
305331801e53SKris Buschelman     break;
3054732ee342SKris Buschelman   }
3055732ee342SKris Buschelman   PetscFunctionReturn(0);
3056732ee342SKris Buschelman }
3057732ee342SKris Buschelman 
3058732ee342SKris Buschelman #undef __FUNCT__
3059732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3060732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3061732ee342SKris Buschelman   int ierr;
3062732ee342SKris Buschelman 
3063732ee342SKris Buschelman   PetscFunctionBegin;
3064732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3065732ee342SKris Buschelman   ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3066732ee342SKris Buschelman   PetscFunctionReturn(0);
3067732ee342SKris Buschelman }
3068732ee342SKris Buschelman 
3069732ee342SKris Buschelman #undef __FUNCT__
3070732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3071732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3072732ee342SKris Buschelman   int ierr;
3073732ee342SKris Buschelman 
3074732ee342SKris Buschelman   PetscFunctionBegin;
3075732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3076732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3077732ee342SKris Buschelman   PetscFunctionReturn(0);
3078732ee342SKris Buschelman }
3079732ee342SKris Buschelman 
3080732ee342SKris Buschelman 
3081