xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision f204ca49f059977577f8ff3e841e7b73f69ef32d)
/*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
/*
    Factorization code for BAIJ format.
*/
54e2b4712SSatish Balay 
64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
74e2b4712SSatish Balay #include "src/inline/ilu.h"
874c49faeSBarry Smith #include "src/inline/dot.h"
94e2b4712SSatish Balay 
104a2ae208SSatish Balay #undef __FUNCT__
114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
127c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
13f1af5d2fSBarry Smith {
14f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
16f1af5d2fSBarry Smith   int             *diag = a->diag;
17f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
1887828ca2SBarry Smith   PetscScalar     s1,*x,*b;
19f1af5d2fSBarry Smith 
20f1af5d2fSBarry Smith   PetscFunctionBegin;
21ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
22b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
23b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
24f1af5d2fSBarry Smith 
25f1af5d2fSBarry Smith   /* forward solve the U^T */
26f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
27f1af5d2fSBarry Smith 
28f1af5d2fSBarry Smith     v     = aa + diag[i];
29f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
30ef66eb69SBarry Smith     s1    = (*v++)*x[i];
31f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
32f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
33f1af5d2fSBarry Smith     while (nz--) {
34f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
35f1af5d2fSBarry Smith     }
36f1af5d2fSBarry Smith     x[i]   = s1;
37f1af5d2fSBarry Smith   }
38f1af5d2fSBarry Smith   /* backward solve the L^T */
39f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
40f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
41f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
42f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
43f1af5d2fSBarry Smith     s1   = x[i];
44f1af5d2fSBarry Smith     while (nz--) {
45f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
46f1af5d2fSBarry Smith     }
47f1af5d2fSBarry Smith   }
48b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
49b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
50b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
51f1af5d2fSBarry Smith   PetscFunctionReturn(0);
52f1af5d2fSBarry Smith }
53f1af5d2fSBarry Smith 
544a2ae208SSatish Balay #undef __FUNCT__
554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
57f1af5d2fSBarry Smith {
58f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
59f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
61f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
6287828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
6387828ca2SBarry Smith   PetscScalar     *x,*b;
64f1af5d2fSBarry Smith 
65f1af5d2fSBarry Smith   PetscFunctionBegin;
66ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
67b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
68b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   /* forward solve the U^T */
71f1af5d2fSBarry Smith   idx = 0;
72f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
73f1af5d2fSBarry Smith 
74f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
75f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
76ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
77f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
78f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
79f1af5d2fSBarry Smith     v += 4;
80f1af5d2fSBarry Smith 
81f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
82f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
83f1af5d2fSBarry Smith     while (nz--) {
84f1af5d2fSBarry Smith       oidx = 2*(*vi++);
85f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
86f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
87f1af5d2fSBarry Smith       v  += 4;
88f1af5d2fSBarry Smith     }
89f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
90f1af5d2fSBarry Smith     idx += 2;
91f1af5d2fSBarry Smith   }
92f1af5d2fSBarry Smith   /* backward solve the L^T */
93f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
94f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
95f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
96f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
97f1af5d2fSBarry Smith     idt  = 2*i;
98f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
99f1af5d2fSBarry Smith     while (nz--) {
100f1af5d2fSBarry Smith       idx   = 2*(*vi--);
101f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
102f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
103f1af5d2fSBarry Smith       v -= 4;
104f1af5d2fSBarry Smith     }
105f1af5d2fSBarry Smith   }
106b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
107b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
108b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
109f1af5d2fSBarry Smith   PetscFunctionReturn(0);
110f1af5d2fSBarry Smith }
111f1af5d2fSBarry Smith 
1124a2ae208SSatish Balay #undef __FUNCT__
1134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
1147c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
115f1af5d2fSBarry Smith {
116f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
117f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
118f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
119f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
12087828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
12187828ca2SBarry Smith   PetscScalar     *x,*b;
122f1af5d2fSBarry Smith 
123f1af5d2fSBarry Smith   PetscFunctionBegin;
124ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
125b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
126b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
127f1af5d2fSBarry Smith 
128f1af5d2fSBarry Smith   /* forward solve the U^T */
129f1af5d2fSBarry Smith   idx = 0;
130f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
133f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
134ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
135f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
136f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
137f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
138f1af5d2fSBarry Smith     v += 9;
139f1af5d2fSBarry Smith 
140f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
141f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
142f1af5d2fSBarry Smith     while (nz--) {
143f1af5d2fSBarry Smith       oidx = 3*(*vi++);
144f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
145f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
146f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
147f1af5d2fSBarry Smith       v  += 9;
148f1af5d2fSBarry Smith     }
149f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
150f1af5d2fSBarry Smith     idx += 3;
151f1af5d2fSBarry Smith   }
152f1af5d2fSBarry Smith   /* backward solve the L^T */
153f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
154f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
155f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
156f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
157f1af5d2fSBarry Smith     idt  = 3*i;
158f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
159f1af5d2fSBarry Smith     while (nz--) {
160f1af5d2fSBarry Smith       idx   = 3*(*vi--);
161f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
162f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
163f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
164f1af5d2fSBarry Smith       v -= 9;
165f1af5d2fSBarry Smith     }
166f1af5d2fSBarry Smith   }
167b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
168b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
169b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
170f1af5d2fSBarry Smith   PetscFunctionReturn(0);
171f1af5d2fSBarry Smith }
172f1af5d2fSBarry Smith 
1734a2ae208SSatish Balay #undef __FUNCT__
1744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
1757c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
176f1af5d2fSBarry Smith {
177f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
178f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
179f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
180f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
18187828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
18287828ca2SBarry Smith   PetscScalar     *x,*b;
183f1af5d2fSBarry Smith 
184f1af5d2fSBarry Smith   PetscFunctionBegin;
185ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
186b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
187b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
188f1af5d2fSBarry Smith 
189f1af5d2fSBarry Smith   /* forward solve the U^T */
190f1af5d2fSBarry Smith   idx = 0;
191f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
194f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
195ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
196f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
197f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
198f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
199f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
200f1af5d2fSBarry Smith     v += 16;
201f1af5d2fSBarry Smith 
202f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
203f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
204f1af5d2fSBarry Smith     while (nz--) {
205f1af5d2fSBarry Smith       oidx = 4*(*vi++);
206f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
207f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
208f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
209f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
210f1af5d2fSBarry Smith       v  += 16;
211f1af5d2fSBarry Smith     }
212f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
213f1af5d2fSBarry Smith     idx += 4;
214f1af5d2fSBarry Smith   }
215f1af5d2fSBarry Smith   /* backward solve the L^T */
216f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
217f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
218f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
219f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
220f1af5d2fSBarry Smith     idt  = 4*i;
221f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
222f1af5d2fSBarry Smith     while (nz--) {
223f1af5d2fSBarry Smith       idx   = 4*(*vi--);
224f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
225f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
226f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
227f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
228f1af5d2fSBarry Smith       v -= 16;
229f1af5d2fSBarry Smith     }
230f1af5d2fSBarry Smith   }
231b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
232b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
233b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
234f1af5d2fSBarry Smith   PetscFunctionReturn(0);
235f1af5d2fSBarry Smith }
236f1af5d2fSBarry Smith 
2374a2ae208SSatish Balay #undef __FUNCT__
2384a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
2397c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
240f1af5d2fSBarry Smith {
241f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
242f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
243f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
244f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
24587828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
24687828ca2SBarry Smith   PetscScalar     *x,*b;
247f1af5d2fSBarry Smith 
248f1af5d2fSBarry Smith   PetscFunctionBegin;
249ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
250b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
251b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
252f1af5d2fSBarry Smith 
253f1af5d2fSBarry Smith   /* forward solve the U^T */
254f1af5d2fSBarry Smith   idx = 0;
255f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
256f1af5d2fSBarry Smith 
257f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
258f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
259ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
260f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
261f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
262f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
263f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
264f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
265f1af5d2fSBarry Smith     v += 25;
266f1af5d2fSBarry Smith 
267f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
268f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
269f1af5d2fSBarry Smith     while (nz--) {
270f1af5d2fSBarry Smith       oidx = 5*(*vi++);
271f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
272f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
273f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
274f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
275f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
276f1af5d2fSBarry Smith       v  += 25;
277f1af5d2fSBarry Smith     }
278f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
279f1af5d2fSBarry Smith     idx += 5;
280f1af5d2fSBarry Smith   }
281f1af5d2fSBarry Smith   /* backward solve the L^T */
282f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
283f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
284f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
285f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
286f1af5d2fSBarry Smith     idt  = 5*i;
287f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
288f1af5d2fSBarry Smith     while (nz--) {
289f1af5d2fSBarry Smith       idx   = 5*(*vi--);
290f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
291f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
292f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
293f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
294f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
295f1af5d2fSBarry Smith       v -= 25;
296f1af5d2fSBarry Smith     }
297f1af5d2fSBarry Smith   }
298b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
299b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
300b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
301f1af5d2fSBarry Smith   PetscFunctionReturn(0);
302f1af5d2fSBarry Smith }
303f1af5d2fSBarry Smith 
3044a2ae208SSatish Balay #undef __FUNCT__
3054a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
3067c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
307f1af5d2fSBarry Smith {
308f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
309f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
311f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
31287828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31387828ca2SBarry Smith   PetscScalar     *x,*b;
314f1af5d2fSBarry Smith 
315f1af5d2fSBarry Smith   PetscFunctionBegin;
316ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
317b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
318b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
319f1af5d2fSBarry Smith 
320f1af5d2fSBarry Smith   /* forward solve the U^T */
321f1af5d2fSBarry Smith   idx = 0;
322f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
325f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
326ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327ef66eb69SBarry Smith     x6    = x[5+idx];
328f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
329f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
330f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
331f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
332f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
333f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
334f1af5d2fSBarry Smith     v += 36;
335f1af5d2fSBarry Smith 
336f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
337f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
338f1af5d2fSBarry Smith     while (nz--) {
339f1af5d2fSBarry Smith       oidx = 6*(*vi++);
340f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
341f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
342f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
343f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
344f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
345f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
346f1af5d2fSBarry Smith       v  += 36;
347f1af5d2fSBarry Smith     }
348f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
349f1af5d2fSBarry Smith     x[5+idx] = s6;
350f1af5d2fSBarry Smith     idx += 6;
351f1af5d2fSBarry Smith   }
352f1af5d2fSBarry Smith   /* backward solve the L^T */
353f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
354f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
355f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
356f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
357f1af5d2fSBarry Smith     idt  = 6*i;
358f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
359f1af5d2fSBarry Smith     s6 = x[5+idt];
360f1af5d2fSBarry Smith     while (nz--) {
361f1af5d2fSBarry Smith       idx   = 6*(*vi--);
362f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
363f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
364f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
365f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
366f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
367f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
368f1af5d2fSBarry Smith       v -= 36;
369f1af5d2fSBarry Smith     }
370f1af5d2fSBarry Smith   }
371b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
372b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
373b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
374f1af5d2fSBarry Smith   PetscFunctionReturn(0);
375f1af5d2fSBarry Smith }
376f1af5d2fSBarry Smith 
3774a2ae208SSatish Balay #undef __FUNCT__
3784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
3797c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
380f1af5d2fSBarry Smith {
381f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
382f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
383f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
384f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
38587828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
38687828ca2SBarry Smith   PetscScalar     *x,*b;
387f1af5d2fSBarry Smith 
388f1af5d2fSBarry Smith   PetscFunctionBegin;
389ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
390b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
391b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
392f1af5d2fSBarry Smith 
393f1af5d2fSBarry Smith   /* forward solve the U^T */
394f1af5d2fSBarry Smith   idx = 0;
395f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
396f1af5d2fSBarry Smith 
397f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
398f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
399ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
400ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
401f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
402f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
403f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
404f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
405f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
406f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
407f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
408f1af5d2fSBarry Smith     v += 49;
409f1af5d2fSBarry Smith 
410f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
411f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
412f1af5d2fSBarry Smith     while (nz--) {
413f1af5d2fSBarry Smith       oidx = 7*(*vi++);
414f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
415f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
416f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
417f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
418f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
419f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
420f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
421f1af5d2fSBarry Smith       v  += 49;
422f1af5d2fSBarry Smith     }
423f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
424f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
425f1af5d2fSBarry Smith     idx += 7;
426f1af5d2fSBarry Smith   }
427f1af5d2fSBarry Smith   /* backward solve the L^T */
428f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
429f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
430f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
431f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
432f1af5d2fSBarry Smith     idt  = 7*i;
433f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
434f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
435f1af5d2fSBarry Smith     while (nz--) {
436f1af5d2fSBarry Smith       idx   = 7*(*vi--);
437f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
438f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
439f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
440f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
441f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
442f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
443f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
444f1af5d2fSBarry Smith       v -= 49;
445f1af5d2fSBarry Smith     }
446f1af5d2fSBarry Smith   }
447b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
448b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
449b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
450f1af5d2fSBarry Smith   PetscFunctionReturn(0);
451f1af5d2fSBarry Smith }
452f1af5d2fSBarry Smith 
453f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4544a2ae208SSatish Balay #undef __FUNCT__
4554a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
4567c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
457f1af5d2fSBarry Smith {
458f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
459f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
460f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
461f1af5d2fSBarry Smith   int             *diag = a->diag;
462f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
46387828ca2SBarry Smith   PetscScalar     s1,*x,*b,*t;
464f1af5d2fSBarry Smith 
465f1af5d2fSBarry Smith   PetscFunctionBegin;
466b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
467b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
468f1af5d2fSBarry Smith   t  = a->solve_work;
469f1af5d2fSBarry Smith 
470f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
471f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
472f1af5d2fSBarry Smith 
473f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
474f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
475f1af5d2fSBarry Smith     t[i] = b[c[i]];
476f1af5d2fSBarry Smith   }
477f1af5d2fSBarry Smith 
478f1af5d2fSBarry Smith   /* forward solve the U^T */
479f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
480f1af5d2fSBarry Smith 
481f1af5d2fSBarry Smith     v     = aa + diag[i];
482f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
483f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
484f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
485f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
486f1af5d2fSBarry Smith     while (nz--) {
487f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
488f1af5d2fSBarry Smith     }
489f1af5d2fSBarry Smith     t[i]   = s1;
490f1af5d2fSBarry Smith   }
491f1af5d2fSBarry Smith   /* backward solve the L^T */
492f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
493f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
494f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
495f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
496f1af5d2fSBarry Smith     s1   = t[i];
497f1af5d2fSBarry Smith     while (nz--) {
498f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
499f1af5d2fSBarry Smith     }
500f1af5d2fSBarry Smith   }
501f1af5d2fSBarry Smith 
502f1af5d2fSBarry Smith   /* copy t into x according to permutation */
503f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
504f1af5d2fSBarry Smith     x[r[i]]   = t[i];
505f1af5d2fSBarry Smith   }
506f1af5d2fSBarry Smith 
507f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
508f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
509b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
510b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
511b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
512f1af5d2fSBarry Smith   PetscFunctionReturn(0);
513f1af5d2fSBarry Smith }
514f1af5d2fSBarry Smith 
5154a2ae208SSatish Balay #undef __FUNCT__
5164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
5177c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
518f1af5d2fSBarry Smith {
519f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
520f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
521f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
522f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
523f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
52487828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
52587828ca2SBarry Smith   PetscScalar     *x,*b,*t;
526f1af5d2fSBarry Smith 
527f1af5d2fSBarry Smith   PetscFunctionBegin;
528b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
529b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
530f1af5d2fSBarry Smith   t  = a->solve_work;
531f1af5d2fSBarry Smith 
532f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
533f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
534f1af5d2fSBarry Smith 
535f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
536f1af5d2fSBarry Smith   ii = 0;
537f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
538f1af5d2fSBarry Smith     ic      = 2*c[i];
539f1af5d2fSBarry Smith     t[ii]   = b[ic];
540f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
541f1af5d2fSBarry Smith     ii += 2;
542f1af5d2fSBarry Smith   }
543f1af5d2fSBarry Smith 
544f1af5d2fSBarry Smith   /* forward solve the U^T */
545f1af5d2fSBarry Smith   idx = 0;
546f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
547f1af5d2fSBarry Smith 
548f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
549f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
550f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
551f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
552f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
553f1af5d2fSBarry Smith     v += 4;
554f1af5d2fSBarry Smith 
555f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
556f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
557f1af5d2fSBarry Smith     while (nz--) {
558f1af5d2fSBarry Smith       oidx = 2*(*vi++);
559f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
560f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
561f1af5d2fSBarry Smith       v  += 4;
562f1af5d2fSBarry Smith     }
563f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
564f1af5d2fSBarry Smith     idx += 2;
565f1af5d2fSBarry Smith   }
566f1af5d2fSBarry Smith   /* backward solve the L^T */
567f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
568f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
569f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
570f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
571f1af5d2fSBarry Smith     idt  = 2*i;
572f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
573f1af5d2fSBarry Smith     while (nz--) {
574f1af5d2fSBarry Smith       idx   = 2*(*vi--);
575f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
576f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
577f1af5d2fSBarry Smith       v -= 4;
578f1af5d2fSBarry Smith     }
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith 
581f1af5d2fSBarry Smith   /* copy t into x according to permutation */
582f1af5d2fSBarry Smith   ii = 0;
583f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
584f1af5d2fSBarry Smith     ir      = 2*r[i];
585f1af5d2fSBarry Smith     x[ir]   = t[ii];
586f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
587f1af5d2fSBarry Smith     ii += 2;
588f1af5d2fSBarry Smith   }
589f1af5d2fSBarry Smith 
590f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
591f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
592b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
593b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
594b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
595f1af5d2fSBarry Smith   PetscFunctionReturn(0);
596f1af5d2fSBarry Smith }
597f1af5d2fSBarry Smith 
5984a2ae208SSatish Balay #undef __FUNCT__
5994a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
6007c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
601f1af5d2fSBarry Smith {
602f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
603f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
604f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
605f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
606f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
60787828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
60887828ca2SBarry Smith   PetscScalar     *x,*b,*t;
609f1af5d2fSBarry Smith 
610f1af5d2fSBarry Smith   PetscFunctionBegin;
611b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
612b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
613f1af5d2fSBarry Smith   t  = a->solve_work;
614f1af5d2fSBarry Smith 
615f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
616f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
617f1af5d2fSBarry Smith 
618f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
619f1af5d2fSBarry Smith   ii = 0;
620f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
621f1af5d2fSBarry Smith     ic      = 3*c[i];
622f1af5d2fSBarry Smith     t[ii]   = b[ic];
623f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
624f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
625f1af5d2fSBarry Smith     ii += 3;
626f1af5d2fSBarry Smith   }
627f1af5d2fSBarry Smith 
628f1af5d2fSBarry Smith   /* forward solve the U^T */
629f1af5d2fSBarry Smith   idx = 0;
630f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
631f1af5d2fSBarry Smith 
632f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
633f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
634f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
635f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
636f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
637f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
638f1af5d2fSBarry Smith     v += 9;
639f1af5d2fSBarry Smith 
640f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
641f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
642f1af5d2fSBarry Smith     while (nz--) {
643f1af5d2fSBarry Smith       oidx = 3*(*vi++);
644f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
645f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
646f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
647f1af5d2fSBarry Smith       v  += 9;
648f1af5d2fSBarry Smith     }
649f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
650f1af5d2fSBarry Smith     idx += 3;
651f1af5d2fSBarry Smith   }
652f1af5d2fSBarry Smith   /* backward solve the L^T */
653f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
654f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
655f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
656f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
657f1af5d2fSBarry Smith     idt  = 3*i;
658f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
659f1af5d2fSBarry Smith     while (nz--) {
660f1af5d2fSBarry Smith       idx   = 3*(*vi--);
661f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
662f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
663f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
664f1af5d2fSBarry Smith       v -= 9;
665f1af5d2fSBarry Smith     }
666f1af5d2fSBarry Smith   }
667f1af5d2fSBarry Smith 
668f1af5d2fSBarry Smith   /* copy t into x according to permutation */
669f1af5d2fSBarry Smith   ii = 0;
670f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
671f1af5d2fSBarry Smith     ir      = 3*r[i];
672f1af5d2fSBarry Smith     x[ir]   = t[ii];
673f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
674f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
675f1af5d2fSBarry Smith     ii += 3;
676f1af5d2fSBarry Smith   }
677f1af5d2fSBarry Smith 
678f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
679f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
680b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
681b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
682b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
683f1af5d2fSBarry Smith   PetscFunctionReturn(0);
684f1af5d2fSBarry Smith }
685f1af5d2fSBarry Smith 
6864a2ae208SSatish Balay #undef __FUNCT__
6874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
6887c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
689f1af5d2fSBarry Smith {
690f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
691f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
692f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
693f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
694f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
69587828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
69687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
697f1af5d2fSBarry Smith 
698f1af5d2fSBarry Smith   PetscFunctionBegin;
699b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
700b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
701f1af5d2fSBarry Smith   t  = a->solve_work;
702f1af5d2fSBarry Smith 
703f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
704f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
705f1af5d2fSBarry Smith 
706f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
707f1af5d2fSBarry Smith   ii = 0;
708f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
709f1af5d2fSBarry Smith     ic      = 4*c[i];
710f1af5d2fSBarry Smith     t[ii]   = b[ic];
711f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
712f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
713f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
714f1af5d2fSBarry Smith     ii += 4;
715f1af5d2fSBarry Smith   }
716f1af5d2fSBarry Smith 
717f1af5d2fSBarry Smith   /* forward solve the U^T */
718f1af5d2fSBarry Smith   idx = 0;
719f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
722f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
723f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
724f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
725f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
726f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
727f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
728f1af5d2fSBarry Smith     v += 16;
729f1af5d2fSBarry Smith 
730f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
731f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
732f1af5d2fSBarry Smith     while (nz--) {
733f1af5d2fSBarry Smith       oidx = 4*(*vi++);
734f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
735f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
736f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
737f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
738f1af5d2fSBarry Smith       v  += 16;
739f1af5d2fSBarry Smith     }
740f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
741f1af5d2fSBarry Smith     idx += 4;
742f1af5d2fSBarry Smith   }
743f1af5d2fSBarry Smith   /* backward solve the L^T */
744f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
745f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
746f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
747f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
748f1af5d2fSBarry Smith     idt  = 4*i;
749f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       idx   = 4*(*vi--);
752f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v -= 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith   }
759f1af5d2fSBarry Smith 
760f1af5d2fSBarry Smith   /* copy t into x according to permutation */
761f1af5d2fSBarry Smith   ii = 0;
762f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
763f1af5d2fSBarry Smith     ir      = 4*r[i];
764f1af5d2fSBarry Smith     x[ir]   = t[ii];
765f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
766f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
767f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
768f1af5d2fSBarry Smith     ii += 4;
769f1af5d2fSBarry Smith   }
770f1af5d2fSBarry Smith 
771f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
772f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
773b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
774b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
775b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
776f1af5d2fSBarry Smith   PetscFunctionReturn(0);
777f1af5d2fSBarry Smith }
778f1af5d2fSBarry Smith 
7794a2ae208SSatish Balay #undef __FUNCT__
7804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
7817c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
782f1af5d2fSBarry Smith {
783f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
784f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
785f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
786f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
787f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
78887828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
78987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
790f1af5d2fSBarry Smith 
791f1af5d2fSBarry Smith   PetscFunctionBegin;
792b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
793b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   t  = a->solve_work;
795f1af5d2fSBarry Smith 
796f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
797f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
798f1af5d2fSBarry Smith 
799f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
800f1af5d2fSBarry Smith   ii = 0;
801f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
802f1af5d2fSBarry Smith     ic      = 5*c[i];
803f1af5d2fSBarry Smith     t[ii]   = b[ic];
804f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
805f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
806f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
807f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
808f1af5d2fSBarry Smith     ii += 5;
809f1af5d2fSBarry Smith   }
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   /* forward solve the U^T */
812f1af5d2fSBarry Smith   idx = 0;
813f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
814f1af5d2fSBarry Smith 
815f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
816f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
817f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
818f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
819f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
820f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
821f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
822f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
823f1af5d2fSBarry Smith     v += 25;
824f1af5d2fSBarry Smith 
825f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
826f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
827f1af5d2fSBarry Smith     while (nz--) {
828f1af5d2fSBarry Smith       oidx = 5*(*vi++);
829f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
830f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
831f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
832f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
833f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
834f1af5d2fSBarry Smith       v  += 25;
835f1af5d2fSBarry Smith     }
836f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
837f1af5d2fSBarry Smith     idx += 5;
838f1af5d2fSBarry Smith   }
839f1af5d2fSBarry Smith   /* backward solve the L^T */
840f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
841f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
842f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
843f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
844f1af5d2fSBarry Smith     idt  = 5*i;
845f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
846f1af5d2fSBarry Smith     while (nz--) {
847f1af5d2fSBarry Smith       idx   = 5*(*vi--);
848f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
849f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
850f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
851f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
852f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
853f1af5d2fSBarry Smith       v -= 25;
854f1af5d2fSBarry Smith     }
855f1af5d2fSBarry Smith   }
856f1af5d2fSBarry Smith 
857f1af5d2fSBarry Smith   /* copy t into x according to permutation */
858f1af5d2fSBarry Smith   ii = 0;
859f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
860f1af5d2fSBarry Smith     ir      = 5*r[i];
861f1af5d2fSBarry Smith     x[ir]   = t[ii];
862f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
863f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
864f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
865f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
866f1af5d2fSBarry Smith     ii += 5;
867f1af5d2fSBarry Smith   }
868f1af5d2fSBarry Smith 
869f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
870f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
871b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
872b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
873b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
874f1af5d2fSBarry Smith   PetscFunctionReturn(0);
875f1af5d2fSBarry Smith }
876f1af5d2fSBarry Smith 
8774a2ae208SSatish Balay #undef __FUNCT__
8784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
8797c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
880f1af5d2fSBarry Smith {
881f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
882f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
883f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
884f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
885f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
88687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
88787828ca2SBarry Smith   PetscScalar     *x,*b,*t;
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   PetscFunctionBegin;
890b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
891b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   t  = a->solve_work;
893f1af5d2fSBarry Smith 
894f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
895f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
896f1af5d2fSBarry Smith 
897f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
898f1af5d2fSBarry Smith   ii = 0;
899f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
900f1af5d2fSBarry Smith     ic      = 6*c[i];
901f1af5d2fSBarry Smith     t[ii]   = b[ic];
902f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
903f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
904f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
905f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
906f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
907f1af5d2fSBarry Smith     ii += 6;
908f1af5d2fSBarry Smith   }
909f1af5d2fSBarry Smith 
910f1af5d2fSBarry Smith   /* forward solve the U^T */
911f1af5d2fSBarry Smith   idx = 0;
912f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
913f1af5d2fSBarry Smith 
914f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
915f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
916f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
917f1af5d2fSBarry Smith     x6    = t[5+idx];
918f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
919f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
920f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
921f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
922f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
923f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
924f1af5d2fSBarry Smith     v += 36;
925f1af5d2fSBarry Smith 
926f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
927f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
928f1af5d2fSBarry Smith     while (nz--) {
929f1af5d2fSBarry Smith       oidx = 6*(*vi++);
930f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
931f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
932f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
933f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
934f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
935f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
936f1af5d2fSBarry Smith       v  += 36;
937f1af5d2fSBarry Smith     }
938f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
939f1af5d2fSBarry Smith     t[5+idx] = s6;
940f1af5d2fSBarry Smith     idx += 6;
941f1af5d2fSBarry Smith   }
942f1af5d2fSBarry Smith   /* backward solve the L^T */
943f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
944f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
945f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
946f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
947f1af5d2fSBarry Smith     idt  = 6*i;
948f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
949f1af5d2fSBarry Smith     s6 = t[5+idt];
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       idx   = 6*(*vi--);
952f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v -= 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith   }
961f1af5d2fSBarry Smith 
962f1af5d2fSBarry Smith   /* copy t into x according to permutation */
963f1af5d2fSBarry Smith   ii = 0;
964f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
965f1af5d2fSBarry Smith     ir      = 6*r[i];
966f1af5d2fSBarry Smith     x[ir]   = t[ii];
967f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
968f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
969f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
970f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
971f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
972f1af5d2fSBarry Smith     ii += 6;
973f1af5d2fSBarry Smith   }
974f1af5d2fSBarry Smith 
975f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
976f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
977b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
978b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
979b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
980f1af5d2fSBarry Smith   PetscFunctionReturn(0);
981f1af5d2fSBarry Smith }
982f1af5d2fSBarry Smith 
9834a2ae208SSatish Balay #undef __FUNCT__
9844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
9857c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
986f1af5d2fSBarry Smith {
987f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
988f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
989f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
990f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
991f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
99287828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
99387828ca2SBarry Smith   PetscScalar     *x,*b,*t;
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith   PetscFunctionBegin;
996b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
997b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   t  = a->solve_work;
999f1af5d2fSBarry Smith 
1000f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1001f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1002f1af5d2fSBarry Smith 
1003f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1004f1af5d2fSBarry Smith   ii = 0;
1005f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1006f1af5d2fSBarry Smith     ic      = 7*c[i];
1007f1af5d2fSBarry Smith     t[ii]   = b[ic];
1008f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1009f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1010f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1011f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1012f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1013f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1014f1af5d2fSBarry Smith     ii += 7;
1015f1af5d2fSBarry Smith   }
1016f1af5d2fSBarry Smith 
1017f1af5d2fSBarry Smith   /* forward solve the U^T */
1018f1af5d2fSBarry Smith   idx = 0;
1019f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1020f1af5d2fSBarry Smith 
1021f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1022f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1023f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1024f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1025f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1026f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1027f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1028f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1029f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1030f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1031f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1032f1af5d2fSBarry Smith     v += 49;
1033f1af5d2fSBarry Smith 
1034f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1035f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1036f1af5d2fSBarry Smith     while (nz--) {
1037f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1038f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1039f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1040f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1041f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1042f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1043f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1044f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1045f1af5d2fSBarry Smith       v  += 49;
1046f1af5d2fSBarry Smith     }
1047f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1048f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1049f1af5d2fSBarry Smith     idx += 7;
1050f1af5d2fSBarry Smith   }
1051f1af5d2fSBarry Smith   /* backward solve the L^T */
1052f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1053f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1054f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1055f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1056f1af5d2fSBarry Smith     idt  = 7*i;
1057f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1058f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1059f1af5d2fSBarry Smith     while (nz--) {
1060f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1061f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1062f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1063f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1064f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1065f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1066f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1067f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1068f1af5d2fSBarry Smith       v -= 49;
1069f1af5d2fSBarry Smith     }
1070f1af5d2fSBarry Smith   }
1071f1af5d2fSBarry Smith 
1072f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1073f1af5d2fSBarry Smith   ii = 0;
1074f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1075f1af5d2fSBarry Smith     ir      = 7*r[i];
1076f1af5d2fSBarry Smith     x[ir]   = t[ii];
1077f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1078f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1079f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1080f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1081f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1082f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1083f1af5d2fSBarry Smith     ii += 7;
1084f1af5d2fSBarry Smith   }
1085f1af5d2fSBarry Smith 
1086f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1087f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1088b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1089b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1090b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1091f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1092f1af5d2fSBarry Smith }
1093f1af5d2fSBarry Smith 
10944e2b4712SSatish Balay /* ----------------------------------------------------------- */
10954a2ae208SSatish Balay #undef __FUNCT__
10964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
10974e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
10984e2b4712SSatish Balay {
10994e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11004e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11014e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
11024e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
11033f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
110487828ca2SBarry Smith   PetscScalar     *x,*b,*s,*t,*ls;
11054e2b4712SSatish Balay 
11064e2b4712SSatish Balay   PetscFunctionBegin;
1107b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1108b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1109f1af5d2fSBarry Smith   t  = a->solve_work;
11104e2b4712SSatish Balay 
11114e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11124e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11134e2b4712SSatish Balay 
11144e2b4712SSatish Balay   /* forward solve the lower triangular */
111587828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11164e2b4712SSatish Balay   for (i=1; i<n; i++) {
11174e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11184e2b4712SSatish Balay     vi  = aj + ai[i];
11194e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1120f1af5d2fSBarry Smith     s = t + bs*i;
112187828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11224e2b4712SSatish Balay     while (nz--) {
1123f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11244e2b4712SSatish Balay       v += bs2;
11254e2b4712SSatish Balay     }
11264e2b4712SSatish Balay   }
11274e2b4712SSatish Balay   /* backward solve the upper triangular */
1128273d9f13SBarry Smith   ls = a->solve_work + A->n;
11294e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11304e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11314e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11324e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
113387828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11344e2b4712SSatish Balay     while (nz--) {
1135f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11364e2b4712SSatish Balay       v += bs2;
11374e2b4712SSatish Balay     }
1138f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
113987828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11404e2b4712SSatish Balay   }
11414e2b4712SSatish Balay 
11424e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11434e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1144b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1145b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1146b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11474e2b4712SSatish Balay   PetscFunctionReturn(0);
11484e2b4712SSatish Balay }
11494e2b4712SSatish Balay 
11504a2ae208SSatish Balay #undef __FUNCT__
11514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
11524e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11534e2b4712SSatish Balay {
11544e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11554e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11564e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11574e2b4712SSatish Balay   int             *diag = a->diag;
11583f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
115987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
116087828ca2SBarry Smith   PetscScalar     *x,*b,*t;
11614e2b4712SSatish Balay 
11624e2b4712SSatish Balay   PetscFunctionBegin;
1163b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1164b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1165f1af5d2fSBarry Smith   t  = a->solve_work;
11664e2b4712SSatish Balay 
11674e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11684e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11694e2b4712SSatish Balay 
11704e2b4712SSatish Balay   /* forward solve the lower triangular */
11714e2b4712SSatish Balay   idx    = 7*(*r++);
1172f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1173f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1174f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11754e2b4712SSatish Balay 
11764e2b4712SSatish Balay   for (i=1; i<n; i++) {
11774e2b4712SSatish Balay     v     = aa + 49*ai[i];
11784e2b4712SSatish Balay     vi    = aj + ai[i];
11794e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11804e2b4712SSatish Balay     idx   = 7*(*r++);
1181f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1182f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11834e2b4712SSatish Balay     while (nz--) {
11844e2b4712SSatish Balay       idx   = 7*(*vi++);
1185f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1186f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1187f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1188f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1189f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1190f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1191f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1192f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1193f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1194f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11954e2b4712SSatish Balay       v += 49;
11964e2b4712SSatish Balay     }
11974e2b4712SSatish Balay     idx = 7*i;
1198f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1199f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1200f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12014e2b4712SSatish Balay   }
12024e2b4712SSatish Balay   /* backward solve the upper triangular */
12034e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12044e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12054e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12064e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12074e2b4712SSatish Balay     idt  = 7*i;
1208f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1209f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1210f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12114e2b4712SSatish Balay     while (nz--) {
12124e2b4712SSatish Balay       idx   = 7*(*vi++);
1213f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1214f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1215f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1216f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1217f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1218f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1219f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1220f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1221f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1222f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12234e2b4712SSatish Balay       v += 49;
12244e2b4712SSatish Balay     }
12254e2b4712SSatish Balay     idc = 7*(*c--);
12264e2b4712SSatish Balay     v   = aa + 49*diag[i];
1227f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1228f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1229f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1230f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1231f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1232f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1233f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1234f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1235f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1236f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1237f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1238f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1239f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1240f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12414e2b4712SSatish Balay   }
12424e2b4712SSatish Balay 
12434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1245b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1246b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1247b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12484e2b4712SSatish Balay   PetscFunctionReturn(0);
12494e2b4712SSatish Balay }
12504e2b4712SSatish Balay 
12514a2ae208SSatish Balay #undef __FUNCT__
12524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
125315091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
125415091d37SBarry Smith {
125515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
125615091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
125715091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
125815091d37SBarry Smith   MatScalar       *aa=a->a,*v;
125987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
126015091d37SBarry Smith 
126115091d37SBarry Smith   PetscFunctionBegin;
1262b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1263b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
126415091d37SBarry Smith   /* forward solve the lower triangular */
126515091d37SBarry Smith   idx    = 0;
126615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
126715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
126815091d37SBarry Smith   x[6] = b[6+idx];
126915091d37SBarry Smith   for (i=1; i<n; i++) {
127015091d37SBarry Smith     v     =  aa + 49*ai[i];
127115091d37SBarry Smith     vi    =  aj + ai[i];
127215091d37SBarry Smith     nz    =  diag[i] - ai[i];
127315091d37SBarry Smith     idx   =  7*i;
1274f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1275f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1276f1af5d2fSBarry Smith     s7  =  b[6+idx];
127715091d37SBarry Smith     while (nz--) {
127815091d37SBarry Smith       jdx   = 7*(*vi++);
127915091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
128015091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
128115091d37SBarry Smith       x7    = x[6+jdx];
1282f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1283f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1284f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1285f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1286f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1287f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1288f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
128915091d37SBarry Smith       v += 49;
129015091d37SBarry Smith      }
1291f1af5d2fSBarry Smith     x[idx]   = s1;
1292f1af5d2fSBarry Smith     x[1+idx] = s2;
1293f1af5d2fSBarry Smith     x[2+idx] = s3;
1294f1af5d2fSBarry Smith     x[3+idx] = s4;
1295f1af5d2fSBarry Smith     x[4+idx] = s5;
1296f1af5d2fSBarry Smith     x[5+idx] = s6;
1297f1af5d2fSBarry Smith     x[6+idx] = s7;
129815091d37SBarry Smith   }
129915091d37SBarry Smith   /* backward solve the upper triangular */
130015091d37SBarry Smith   for (i=n-1; i>=0; i--){
130115091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
130215091d37SBarry Smith     vi   = aj + diag[i] + 1;
130315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
130415091d37SBarry Smith     idt  = 7*i;
1305f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1306f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1307f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1308f1af5d2fSBarry Smith     s7 = x[6+idt];
130915091d37SBarry Smith     while (nz--) {
131015091d37SBarry Smith       idx   = 7*(*vi++);
131115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
131215091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
131315091d37SBarry Smith       x7    = x[6+idx];
1314f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1315f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1316f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1317f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1318f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1319f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1320f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
132115091d37SBarry Smith       v += 49;
132215091d37SBarry Smith     }
132315091d37SBarry Smith     v        = aa + 49*diag[i];
1324f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1325f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1326f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1327f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1328f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1329f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1330f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1331f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1332f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1333f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1334f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1335f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1336f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1337f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
133815091d37SBarry Smith   }
133915091d37SBarry Smith 
1340b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1341b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1342b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
134315091d37SBarry Smith   PetscFunctionReturn(0);
134415091d37SBarry Smith }
134515091d37SBarry Smith 
13464a2ae208SSatish Balay #undef __FUNCT__
13474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
134815091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
134915091d37SBarry Smith {
135015091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
135115091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
135215091d37SBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
135315091d37SBarry Smith   int             *diag = a->diag;
135415091d37SBarry Smith   MatScalar       *aa=a->a,*v;
135587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
135615091d37SBarry Smith 
135715091d37SBarry Smith   PetscFunctionBegin;
1358b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1359b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1360f1af5d2fSBarry Smith   t  = a->solve_work;
136115091d37SBarry Smith 
136215091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
136315091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
136415091d37SBarry Smith 
136515091d37SBarry Smith   /* forward solve the lower triangular */
136615091d37SBarry Smith   idx    = 6*(*r++);
1367f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1368f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1369f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
137015091d37SBarry Smith   for (i=1; i<n; i++) {
137115091d37SBarry Smith     v     = aa + 36*ai[i];
137215091d37SBarry Smith     vi    = aj + ai[i];
137315091d37SBarry Smith     nz    = diag[i] - ai[i];
137415091d37SBarry Smith     idx   = 6*(*r++);
1375f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1376f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
137715091d37SBarry Smith     while (nz--) {
137815091d37SBarry Smith       idx   = 6*(*vi++);
1379f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1380f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1381f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1382f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1383f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1384f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1385f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1386f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
138715091d37SBarry Smith       v += 36;
138815091d37SBarry Smith     }
138915091d37SBarry Smith     idx = 6*i;
1390f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1391f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1392f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
139315091d37SBarry Smith   }
139415091d37SBarry Smith   /* backward solve the upper triangular */
139515091d37SBarry Smith   for (i=n-1; i>=0; i--){
139615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
139715091d37SBarry Smith     vi   = aj + diag[i] + 1;
139815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
139915091d37SBarry Smith     idt  = 6*i;
1400f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1401f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1402f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
140315091d37SBarry Smith     while (nz--) {
140415091d37SBarry Smith       idx   = 6*(*vi++);
1405f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1406f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1407f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1408f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141415091d37SBarry Smith       v += 36;
141515091d37SBarry Smith     }
141615091d37SBarry Smith     idc = 6*(*c--);
141715091d37SBarry Smith     v   = aa + 36*diag[i];
1418f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1419f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1420f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1421f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1422f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1423f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1424f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1425f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1426f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1427f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1428f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1429f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
143015091d37SBarry Smith   }
143115091d37SBarry Smith 
143215091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
143315091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1434b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1435b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1436b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
143715091d37SBarry Smith   PetscFunctionReturn(0);
143815091d37SBarry Smith }
143915091d37SBarry Smith 
14404a2ae208SSatish Balay #undef __FUNCT__
14414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
144215091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
144315091d37SBarry Smith {
144415091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
144515091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
144615091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
144715091d37SBarry Smith   MatScalar       *aa=a->a,*v;
144887828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
144915091d37SBarry Smith 
145015091d37SBarry Smith   PetscFunctionBegin;
1451b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1452b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
145315091d37SBarry Smith   /* forward solve the lower triangular */
145415091d37SBarry Smith   idx    = 0;
145515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
145615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
145715091d37SBarry Smith   for (i=1; i<n; i++) {
145815091d37SBarry Smith     v     =  aa + 36*ai[i];
145915091d37SBarry Smith     vi    =  aj + ai[i];
146015091d37SBarry Smith     nz    =  diag[i] - ai[i];
146115091d37SBarry Smith     idx   =  6*i;
1462f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1463f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
146415091d37SBarry Smith     while (nz--) {
146515091d37SBarry Smith       jdx   = 6*(*vi++);
146615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
146715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1468f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1469f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1470f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1471f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1472f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1473f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
147415091d37SBarry Smith       v += 36;
147515091d37SBarry Smith      }
1476f1af5d2fSBarry Smith     x[idx]   = s1;
1477f1af5d2fSBarry Smith     x[1+idx] = s2;
1478f1af5d2fSBarry Smith     x[2+idx] = s3;
1479f1af5d2fSBarry Smith     x[3+idx] = s4;
1480f1af5d2fSBarry Smith     x[4+idx] = s5;
1481f1af5d2fSBarry Smith     x[5+idx] = s6;
148215091d37SBarry Smith   }
148315091d37SBarry Smith   /* backward solve the upper triangular */
148415091d37SBarry Smith   for (i=n-1; i>=0; i--){
148515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148615091d37SBarry Smith     vi   = aj + diag[i] + 1;
148715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148815091d37SBarry Smith     idt  = 6*i;
1489f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1490f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1491f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
149215091d37SBarry Smith     while (nz--) {
149315091d37SBarry Smith       idx   = 6*(*vi++);
149415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
149515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1496f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1497f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1498f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1499f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1500f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1501f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150215091d37SBarry Smith       v += 36;
150315091d37SBarry Smith     }
150415091d37SBarry Smith     v        = aa + 36*diag[i];
1505f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1506f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1507f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1508f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1509f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1510f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
151115091d37SBarry Smith   }
151215091d37SBarry Smith 
1513b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1514b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1515b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
151615091d37SBarry Smith   PetscFunctionReturn(0);
151715091d37SBarry Smith }
151815091d37SBarry Smith 
15194a2ae208SSatish Balay #undef __FUNCT__
15204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
15214e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15224e2b4712SSatish Balay {
15234e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15244e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
15254e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15264e2b4712SSatish Balay   int             *diag = a->diag;
15273f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
152887828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15294e2b4712SSatish Balay 
15304e2b4712SSatish Balay   PetscFunctionBegin;
1531b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1532b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1533f1af5d2fSBarry Smith   t  = a->solve_work;
15344e2b4712SSatish Balay 
15354e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15364e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15374e2b4712SSatish Balay 
15384e2b4712SSatish Balay   /* forward solve the lower triangular */
15394e2b4712SSatish Balay   idx    = 5*(*r++);
1540f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1541f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15424e2b4712SSatish Balay   for (i=1; i<n; i++) {
15434e2b4712SSatish Balay     v     = aa + 25*ai[i];
15444e2b4712SSatish Balay     vi    = aj + ai[i];
15454e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15464e2b4712SSatish Balay     idx   = 5*(*r++);
1547f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1548f1af5d2fSBarry Smith     s5  = b[4+idx];
15494e2b4712SSatish Balay     while (nz--) {
15504e2b4712SSatish Balay       idx   = 5*(*vi++);
1551f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1552f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1553f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1554f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1555f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1556f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1557f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15584e2b4712SSatish Balay       v += 25;
15594e2b4712SSatish Balay     }
15604e2b4712SSatish Balay     idx = 5*i;
1561f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1562f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15634e2b4712SSatish Balay   }
15644e2b4712SSatish Balay   /* backward solve the upper triangular */
15654e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15664e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15674e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15684e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15694e2b4712SSatish Balay     idt  = 5*i;
1570f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1571f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15724e2b4712SSatish Balay     while (nz--) {
15734e2b4712SSatish Balay       idx   = 5*(*vi++);
1574f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1575f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1576f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1577f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1578f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1579f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1580f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15814e2b4712SSatish Balay       v += 25;
15824e2b4712SSatish Balay     }
15834e2b4712SSatish Balay     idc = 5*(*c--);
15844e2b4712SSatish Balay     v   = aa + 25*diag[i];
1585f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1586f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1587f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1588f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1589f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1590f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1591f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1592f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1593f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1594f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
15954e2b4712SSatish Balay   }
15964e2b4712SSatish Balay 
15974e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15984e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1599b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1600b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1601b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
16024e2b4712SSatish Balay   PetscFunctionReturn(0);
16034e2b4712SSatish Balay }
16044e2b4712SSatish Balay 
16054a2ae208SSatish Balay #undef __FUNCT__
16064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
160715091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
160815091d37SBarry Smith {
160915091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
161015091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
161115091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
161215091d37SBarry Smith   MatScalar       *aa=a->a,*v;
161387828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
161415091d37SBarry Smith 
161515091d37SBarry Smith   PetscFunctionBegin;
1616b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1617b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
161815091d37SBarry Smith   /* forward solve the lower triangular */
161915091d37SBarry Smith   idx    = 0;
162015091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
162115091d37SBarry Smith   for (i=1; i<n; i++) {
162215091d37SBarry Smith     v     =  aa + 25*ai[i];
162315091d37SBarry Smith     vi    =  aj + ai[i];
162415091d37SBarry Smith     nz    =  diag[i] - ai[i];
162515091d37SBarry Smith     idx   =  5*i;
1626f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
162715091d37SBarry Smith     while (nz--) {
162815091d37SBarry Smith       jdx   = 5*(*vi++);
162915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1630f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1631f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1632f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1633f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1634f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
163515091d37SBarry Smith       v    += 25;
163615091d37SBarry Smith     }
1637f1af5d2fSBarry Smith     x[idx]   = s1;
1638f1af5d2fSBarry Smith     x[1+idx] = s2;
1639f1af5d2fSBarry Smith     x[2+idx] = s3;
1640f1af5d2fSBarry Smith     x[3+idx] = s4;
1641f1af5d2fSBarry Smith     x[4+idx] = s5;
164215091d37SBarry Smith   }
164315091d37SBarry Smith   /* backward solve the upper triangular */
164415091d37SBarry Smith   for (i=n-1; i>=0; i--){
164515091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
164615091d37SBarry Smith     vi   = aj + diag[i] + 1;
164715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
164815091d37SBarry Smith     idt  = 5*i;
1649f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1650f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
165115091d37SBarry Smith     while (nz--) {
165215091d37SBarry Smith       idx   = 5*(*vi++);
165315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1654f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1655f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1656f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1657f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1658f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
165915091d37SBarry Smith       v    += 25;
166015091d37SBarry Smith     }
166115091d37SBarry Smith     v        = aa + 25*diag[i];
1662f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1663f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1664f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1665f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1666f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
166715091d37SBarry Smith   }
166815091d37SBarry Smith 
1669b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1670b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1671b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
167215091d37SBarry Smith   PetscFunctionReturn(0);
167315091d37SBarry Smith }
167415091d37SBarry Smith 
16754a2ae208SSatish Balay #undef __FUNCT__
16764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
16774e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16784e2b4712SSatish Balay {
16794e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
16804e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
16814e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
16824e2b4712SSatish Balay   int             *diag = a->diag;
16833f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
168487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
16854e2b4712SSatish Balay 
16864e2b4712SSatish Balay   PetscFunctionBegin;
1687b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1688b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1689f1af5d2fSBarry Smith   t  = a->solve_work;
16904e2b4712SSatish Balay 
16914e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16924e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16934e2b4712SSatish Balay 
16944e2b4712SSatish Balay   /* forward solve the lower triangular */
16954e2b4712SSatish Balay   idx    = 4*(*r++);
1696f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1697f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
16984e2b4712SSatish Balay   for (i=1; i<n; i++) {
16994e2b4712SSatish Balay     v     = aa + 16*ai[i];
17004e2b4712SSatish Balay     vi    = aj + ai[i];
17014e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17024e2b4712SSatish Balay     idx   = 4*(*r++);
1703f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17044e2b4712SSatish Balay     while (nz--) {
17054e2b4712SSatish Balay       idx   = 4*(*vi++);
1706f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1707f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1708f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1709f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1710f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17114e2b4712SSatish Balay       v    += 16;
17124e2b4712SSatish Balay     }
17134e2b4712SSatish Balay     idx        = 4*i;
1714f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1715f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17164e2b4712SSatish Balay   }
17174e2b4712SSatish Balay   /* backward solve the upper triangular */
17184e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17194e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17204e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17214e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17224e2b4712SSatish Balay     idt  = 4*i;
1723f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1724f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17254e2b4712SSatish Balay     while (nz--) {
17264e2b4712SSatish Balay       idx   = 4*(*vi++);
1727f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1728f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1729f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1730f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1731f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1732f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17334e2b4712SSatish Balay       v += 16;
17344e2b4712SSatish Balay     }
17354e2b4712SSatish Balay     idc      = 4*(*c--);
17364e2b4712SSatish Balay     v        = aa + 16*diag[i];
1737f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1738f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1739f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1740f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17414e2b4712SSatish Balay   }
17424e2b4712SSatish Balay 
17434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1745b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1746b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1747b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17484e2b4712SSatish Balay   PetscFunctionReturn(0);
17494e2b4712SSatish Balay }
1750f26ec98cSKris Buschelman 
1751f26ec98cSKris Buschelman #undef __FUNCT__
1752f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1753f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1754f26ec98cSKris Buschelman {
1755f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1756f26ec98cSKris Buschelman   IS              iscol=a->col,isrow=a->row;
1757f26ec98cSKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1758f26ec98cSKris Buschelman   int             *diag = a->diag;
1759f26ec98cSKris Buschelman   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1760f26ec98cSKris Buschelman   PetscScalar     *x,*b;
1761f26ec98cSKris Buschelman 
1762f26ec98cSKris Buschelman   PetscFunctionBegin;
1763b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
1764b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
1765f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1766f26ec98cSKris Buschelman 
1767f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1768f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1769f26ec98cSKris Buschelman 
1770f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1771f26ec98cSKris Buschelman   idx    = 4*(*r++);
1772f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1773f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1774f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1775f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1776f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1777f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1778f26ec98cSKris Buschelman     vi    = aj + ai[i];
1779f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1780f26ec98cSKris Buschelman     idx   = 4*(*r++);
1781f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1782f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1783f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1784f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1785f26ec98cSKris Buschelman     while (nz--) {
1786f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1787f26ec98cSKris Buschelman       x1  = t[idx];
1788f26ec98cSKris Buschelman       x2  = t[1+idx];
1789f26ec98cSKris Buschelman       x3  = t[2+idx];
1790f26ec98cSKris Buschelman       x4  = t[3+idx];
1791f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1792f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1793f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1794f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1795f26ec98cSKris Buschelman       v    += 16;
1796f26ec98cSKris Buschelman     }
1797f26ec98cSKris Buschelman     idx        = 4*i;
1798f26ec98cSKris Buschelman     t[idx]   = s1;
1799f26ec98cSKris Buschelman     t[1+idx] = s2;
1800f26ec98cSKris Buschelman     t[2+idx] = s3;
1801f26ec98cSKris Buschelman     t[3+idx] = s4;
1802f26ec98cSKris Buschelman   }
1803f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1804f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1805f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1806f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1807f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1808f26ec98cSKris Buschelman     idt  = 4*i;
1809f26ec98cSKris Buschelman     s1 = t[idt];
1810f26ec98cSKris Buschelman     s2 = t[1+idt];
1811f26ec98cSKris Buschelman     s3 = t[2+idt];
1812f26ec98cSKris Buschelman     s4 = t[3+idt];
1813f26ec98cSKris Buschelman     while (nz--) {
1814f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1815f26ec98cSKris Buschelman       x1  = t[idx];
1816f26ec98cSKris Buschelman       x2  = t[1+idx];
1817f26ec98cSKris Buschelman       x3  = t[2+idx];
1818f26ec98cSKris Buschelman       x4  = t[3+idx];
1819f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1820f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1821f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1822f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1823f26ec98cSKris Buschelman       v += 16;
1824f26ec98cSKris Buschelman     }
1825f26ec98cSKris Buschelman     idc      = 4*(*c--);
1826f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1827f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1828f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1829f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1830f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1831f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1832f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1833f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1834f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1835f26ec98cSKris Buschelman  }
1836f26ec98cSKris Buschelman 
1837f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1838f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1839b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
1840b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
1841f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1842f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1843f26ec98cSKris Buschelman }
1844f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - SSE version of the permuted block-size-4
   triangular solve.  The 4x4 block products are carried out on packed floats;
   values from the PetscScalar vectors are demoted to float before each
   product and promoted back afterwards.

   Input Parameters:
+  A  - the factored matrix (A->data is read as a Mat_SeqBAIJ)
-  bb - the right-hand-side vector

   Output Parameter:
.  xx - the solution vector

   Notes:
   This code uses demotion of double to float when performing the mixed-mode
   computation.  This may not be numerically reasonable for all applications.

   The right-hand side is read through a->row, the intermediate vector is
   kept in a->solve_work (t), and finished block rows are streamed into xx
   through a->col.

   XMM7 is the running accumulator for the current block row; XMM6 holds the
   demoted 4-vector being multiplied; XMM0-XMM3 are per-column scratch.

   NOTE(review): the final copy uses packed loads/stores on t and x, which
   presumably requires both to be 16-byte aligned - confirm against the SSE
   macro definitions.

   Returns 0 on success; errors from the Vec/IS calls go through CHKERRQ.
*/
int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
  IS              iscol=a->col,isrow=a->row;
  int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int             *diag = a->diag,ai16;
  MatScalar       *aa=a->a,*v;
  PetscScalar     *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /* Carve two 16-byte aligned float[4] slots (tmps, tmpx) out of the buffer:
       skip 0-3 leading floats, which leaves 8 usable ones */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float.  The forward-solve
           results of the earlier block rows are stored in t (x has not been
           written yet at this point), so they must be read from t. */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      /* The loop counter is advanced here so the next row's blocks can be
         prefetched before the current result is stored */
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      /* v now points at this row's diagonal block */
      v    = aa + ai16;
      /* Step to the previous block row.  Look up its diagonal offset only if
         that row exists; on the final pass (i becomes -1) ai16 keeps its
         value so diag[-1] is never read and the prefetches below stay on
         valid data. */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
    PetscLogFlops(2*16*(a->nz) - 4*A->n);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
20660ef38995SBarry Smith 
20670ef38995SBarry Smith 
20684e2b4712SSatish Balay /*
20694e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
20704e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
20714e2b4712SSatish Balay */
20724a2ae208SSatish Balay #undef __FUNCT__
20734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
20744e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
20754e2b4712SSatish Balay {
20764e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
207730d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
207830d4dcafSBarry Smith   int             ierr,*diag = a->diag;
20793f1db9ecSBarry Smith   MatScalar       *aa=a->a;
208087828ca2SBarry Smith   PetscScalar     *x,*b;
20814e2b4712SSatish Balay 
20824e2b4712SSatish Balay   PetscFunctionBegin;
2083b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2084b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
20854e2b4712SSatish Balay 
2086aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
20872853dc0eSBarry Smith   {
208887828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20892853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
20902853dc0eSBarry Smith   }
2091aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
20922853dc0eSBarry Smith   {
209387828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20942853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
20952853dc0eSBarry Smith   }
2096aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
20972853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2098e1293385SBarry Smith #else
209930d4dcafSBarry Smith   {
210087828ca2SBarry Smith     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
21013f1db9ecSBarry Smith     MatScalar    *v;
21024e555682SBarry Smith     int          jdx,idt,idx,nz,*vi,i,ai16;
2103e1293385SBarry Smith 
21044e2b4712SSatish Balay   /* forward solve the lower triangular */
21054e2b4712SSatish Balay   idx    = 0;
2106e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21074e2b4712SSatish Balay   for (i=1; i<n; i++) {
21084e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21094e2b4712SSatish Balay     vi    =  aj      + ai[i];
21104e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2111e1293385SBarry Smith     idx   +=  4;
2112f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21134e2b4712SSatish Balay     while (nz--) {
21144e2b4712SSatish Balay       jdx   = 4*(*vi++);
21154e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2116f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2117f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2118f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2119f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21204e2b4712SSatish Balay       v    += 16;
21214e2b4712SSatish Balay     }
2122f1af5d2fSBarry Smith     x[idx]   = s1;
2123f1af5d2fSBarry Smith     x[1+idx] = s2;
2124f1af5d2fSBarry Smith     x[2+idx] = s3;
2125f1af5d2fSBarry Smith     x[3+idx] = s4;
21264e2b4712SSatish Balay   }
21274e2b4712SSatish Balay   /* backward solve the upper triangular */
21284e555682SBarry Smith   idt = 4*(n-1);
21294e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21304e555682SBarry Smith     ai16 = 16*diag[i];
21314e555682SBarry Smith     v    = aa + ai16 + 16;
21324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2134f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2135f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21364e2b4712SSatish Balay     while (nz--) {
21374e2b4712SSatish Balay       idx   = 4*(*vi++);
21384e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2139f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2140f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2141f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2142f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21434e2b4712SSatish Balay       v    += 16;
21444e2b4712SSatish Balay     }
21454e555682SBarry Smith     v        = aa + ai16;
2146f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2147f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2148f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2149f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2150329f5518SBarry Smith     idt -= 4;
21514e2b4712SSatish Balay   }
215230d4dcafSBarry Smith   }
2153e1293385SBarry Smith #endif
21544e2b4712SSatish Balay 
2155b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2156b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2157b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
21584e2b4712SSatish Balay   PetscFunctionReturn(0);
21594e2b4712SSatish Balay }
21604e2b4712SSatish Balay 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - forward sweep over the lower
   triangular blocks followed by a backward sweep over the upper triangular
   blocks, using 4x4 blocks.  The right-hand side is read from bb and the
   result is written to xx.  Block rows are processed in stored order; no
   row/column index sets are consulted.

   All intermediate values are held in MatScalar.  The forward sweep stores its
   results through t, which is the storage of x viewed as MatScalar; the backward
   sweep then writes PetscScalar values back into x, last block row first.

   NOTE(review): the in-place scheme looks like it relies on MatScalar being
   narrower than PetscScalar (the function name suggests float vs. double) -
   confirm against the build configuration.
   NOTE(review): t[0..3] / b[0..3] are accessed before any test on n, so at
   least one block row seems to be assumed - confirm with callers.

   Ends with PetscFunctionReturn(0); each ierr is passed to CHKERRQ.
*/
2161f26ec98cSKris Buschelman #undef __FUNCT__
2162f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2163f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2164f26ec98cSKris Buschelman {
2165f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2166f26ec98cSKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
2167f26ec98cSKris Buschelman   int             ierr,*diag = a->diag;
2168f26ec98cSKris Buschelman   MatScalar       *aa=a->a;
2169f26ec98cSKris Buschelman   PetscScalar     *x,*b;
2170f26ec98cSKris Buschelman 
2171f26ec98cSKris Buschelman   PetscFunctionBegin;
2172b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2173b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2174f26ec98cSKris Buschelman 
2175f26ec98cSKris Buschelman   {
2176f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
    /* t is the same memory as x, indexed in units of MatScalar */
2177f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2178f26ec98cSKris Buschelman     int        jdx,idt,idx,nz,*vi,i,ai16;
2179f26ec98cSKris Buschelman 
2180f26ec98cSKris Buschelman     /* forward solve the lower triangular */
    /* block row 0 is copied from b unchanged (converted to MatScalar) */
2181f26ec98cSKris Buschelman     idx  = 0;
2182f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2183f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2184f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2185f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2186f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
      /* v: first 16-entry block of row i; vi: its block-column indices;
         nz: number of blocks stored before the diagonal block of row i */
2187f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2188f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2189f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2190f26ec98cSKris Buschelman       idx   +=  4;
2191f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2192f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2193f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2194f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2195f26ec98cSKris Buschelman       while (nz--) {
        /* s -= block * t[jdx..jdx+3]; entry (r,c) of the block is v[r+4*c] */
2196f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2197f26ec98cSKris Buschelman         x1  = t[jdx];
2198f26ec98cSKris Buschelman         x2  = t[1+jdx];
2199f26ec98cSKris Buschelman         x3  = t[2+jdx];
2200f26ec98cSKris Buschelman         x4  = t[3+jdx];
2201f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2202f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2203f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2204f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2205f26ec98cSKris Buschelman         v    += 16;
2206f26ec98cSKris Buschelman       }
2207f26ec98cSKris Buschelman       t[idx]   = s1;
2208f26ec98cSKris Buschelman       t[1+idx] = s2;
2209f26ec98cSKris Buschelman       t[2+idx] = s3;
2210f26ec98cSKris Buschelman       t[3+idx] = s4;
2211f26ec98cSKris Buschelman     }
2212f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2213f26ec98cSKris Buschelman     idt = 4*(n-1);
2214f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
      /* ai16: offset of the diagonal block of row i; v starts at the block
         right after it; nz: number of blocks stored after the diagonal */
2215f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2216f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2217f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2218f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
      /* forward-sweep result for this row, still in MatScalar form */
2219f26ec98cSKris Buschelman       s1   = t[idt];
2220f26ec98cSKris Buschelman       s2   = t[1+idt];
2221f26ec98cSKris Buschelman       s3   = t[2+idt];
2222f26ec98cSKris Buschelman       s4   = t[3+idt];
2223f26ec98cSKris Buschelman       while (nz--) {
        /* operands are read from x as PetscScalar, not from t; presumably
           idx > idt here, i.e. rows already finalized below - verify */
2224f26ec98cSKris Buschelman         idx = 4*(*vi++);
2225f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2226f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2227f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2228f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2229f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2230f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2231f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2232f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2233f26ec98cSKris Buschelman         v    += 16;
2234f26ec98cSKris Buschelman       }
      /* multiply by the diagonal block (presumably stored already inverted -
         TODO confirm) and store the final row of x as PetscScalar */
2235f26ec98cSKris Buschelman       v        = aa + ai16;
2236f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2237f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2238f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2239f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2240f26ec98cSKris Buschelman       idt -= 4;
2241f26ec98cSKris Buschelman     }
2242f26ec98cSKris Buschelman   }
2243f26ec98cSKris Buschelman 
2244b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2245b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2246f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2247f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2248f26ec98cSKris Buschelman }
2249f26ec98cSKris Buschelman 
22503660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22513660e330SKris Buschelman 
22523660e330SKris Buschelman #include PETSC_HAVE_SSE
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj - forward sweep over the
   lower triangular blocks and backward sweep over the upper triangular blocks
   (4x4 blocks), written with the SSE_* / XMM* macros.  Reads bb, writes xx.

   In this routine a->j is reinterpreted as an array of unsigned short, so the
   block-column indices must already be stored in that form.
   NOTE(review): the code that stores them that way is not in this function -
   confirm before installing this routine for a matrix.

   Both sweeps operate on t, the storage of x viewed as MatScalar; a final loop
   converts t to PetscScalar in place, last entry first.

   NOTE(review): ai[1], t[0..3] and b[0..3] are accessed before any test on n,
   so at least one block row seems to be assumed - confirm with callers.

   Ends with PetscFunctionReturn(0); each ierr is passed to CHKERRQ.
*/
22533660e330SKris Buschelman #undef __FUNCT__
22547cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
22557cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
22563660e330SKris Buschelman {
22573660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
22582aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
22592aa5897fSKris Buschelman   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
22603660e330SKris Buschelman   MatScalar      *aa=a->a;
226187828ca2SBarry Smith   PetscScalar    *x,*b;
22623660e330SKris Buschelman 
22633660e330SKris Buschelman   PetscFunctionBegin;
22643660e330SKris Buschelman   SSE_SCOPE_BEGIN;
22653660e330SKris Buschelman   /*
22663660e330SKris Buschelman      Note: This code currently uses demotion of double
22673660e330SKris Buschelman      to float when performing the mixed-mode computation.
22683660e330SKris Buschelman      This may not be numerically reasonable for all applications.
22693660e330SKris Buschelman   */
22703660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
22713660e330SKris Buschelman 
22726f6a888dSBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
22736f6a888dSBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
22743660e330SKris Buschelman   {
2275eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2276eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
22772aa5897fSKris Buschelman     int            nz,i,idt,ai16;
22782aa5897fSKris Buschelman     unsigned int   jdx,idx;
22792aa5897fSKris Buschelman     unsigned short *vi;
2280eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
22813660e330SKris Buschelman 
2282eb05f457SKris Buschelman     /* First block is the identity. */
22833660e330SKris Buschelman     idx  = 0;
2284eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
22852aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
22863660e330SKris Buschelman 
    /* i is advanced inside the body (ai[++i] near the end), not in the for header */
22873660e330SKris Buschelman     for (i=1; i<n;) {
22883660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi: block-column indices of row i; nz: blocks stored before the diagonal */
22893660e330SKris Buschelman       vi   =  aj      + ai[i];
22903660e330SKris Buschelman       nz   =  diag[i] - ai[i];
22913660e330SKris Buschelman       idx +=  4;
22923660e330SKris Buschelman 
2293eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2294eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 is loaded from t[idx..] here and stored back to the same place after
         the inner loop; the SSE_SUB_PS lines in between name it as first operand */
2295eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
22963660e330SKris Buschelman 
22973660e330SKris Buschelman       while (nz--) {
22983660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
22992aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
23003660e330SKris Buschelman 
23013660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
        /* macro arguments: &t[jdx] is passed first, the current block v second */
2302eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
23033660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23043660e330SKris Buschelman 
23053660e330SKris Buschelman           /* First Column */
23063660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23073660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23083660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23093660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23103660e330SKris Buschelman 
23113660e330SKris Buschelman           /* Second Column */
23123660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23133660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23143660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23163660e330SKris Buschelman 
23173660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23183660e330SKris Buschelman 
23193660e330SKris Buschelman           /* Third Column */
23203660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23213660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23223660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23233660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23243660e330SKris Buschelman 
23253660e330SKris Buschelman           /* Fourth Column */
23263660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23273660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23283660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23303660e330SKris Buschelman         SSE_INLINE_END_2
23313660e330SKris Buschelman 
23323660e330SKris Buschelman         v  += 16;
23333660e330SKris Buschelman       }
      /* advance to the next block row; on the last pass this reads ai[n] */
23343660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23353660e330SKris Buschelman       PREFETCH_NTA(v);
2336eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
23373660e330SKris Buschelman     }
2338eb05f457SKris Buschelman 
2339eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2340eb05f457SKris Buschelman 
23413660e330SKris Buschelman     idt  = 4*(n-1);
23423660e330SKris Buschelman     ai16 = 16*diag[n-1];
23433660e330SKris Buschelman     v    = aa + ai16 + 16;
    /* i is decremented inside the body (diag[--i] below), not in the for header */
23443660e330SKris Buschelman     for (i=n-1; i>=0;){
23453660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi: indices of the blocks after the diagonal of row i; nz: their count */
23463660e330SKris Buschelman       vi = aj + diag[i] + 1;
23473660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23483660e330SKris Buschelman 
2349eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
23503660e330SKris Buschelman 
23513660e330SKris Buschelman       while (nz--) {
23523660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23532aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
23543660e330SKris Buschelman 
23553660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2356eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
23573660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23583660e330SKris Buschelman 
23593660e330SKris Buschelman           /* First Column */
23603660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23613660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23623660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23633660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23643660e330SKris Buschelman 
23653660e330SKris Buschelman           /* Second Column */
23663660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23673660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23683660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23693660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23703660e330SKris Buschelman 
23713660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23723660e330SKris Buschelman 
23733660e330SKris Buschelman           /* Third Column */
23743660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23753660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23763660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23773660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23783660e330SKris Buschelman 
23793660e330SKris Buschelman           /* Fourth Column */
23803660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23813660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23823660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23833660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23843660e330SKris Buschelman         SSE_INLINE_END_2
23853660e330SKris Buschelman         v  += 16;
23863660e330SKris Buschelman       }
      /* v now points at the diagonal block of the current row; ai16 is then moved
         on to the previous row so that its diagonal block can be prefetched.
         NOTE(review): on the final pass i goes from 0 to -1, so diag[-1] is read
         here.  The value only reaches the prefetch macros and the v assigned at
         the bottom of the loop, which is not used again once the loop exits, but
         the read itself is outside the indices 0..n-1 used everywhere else. */
23873660e330SKris Buschelman       v    = aa + ai16;
23883660e330SKris Buschelman       ai16 = 16*diag[--i];
23893660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
23903660e330SKris Buschelman       /*
23913660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
23923660e330SKris Buschelman          which was inverted as part of the factorization
23933660e330SKris Buschelman       */
      /* macro arguments: v (diagonal block) is multiplied, &t[idt] receives the
         store, aa+ai16 is referenced only by the SSE_PREFETCH_NTA line */
2394eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
23953660e330SKris Buschelman         /* First Column */
23963660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
23973660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
23983660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
23993660e330SKris Buschelman 
24003660e330SKris Buschelman         /* Second Column */
24013660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24023660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24033660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24043660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24053660e330SKris Buschelman 
24063660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24073660e330SKris Buschelman 
24083660e330SKris Buschelman         /* Third Column */
24093660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24103660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24113660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24123660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24133660e330SKris Buschelman 
24143660e330SKris Buschelman         /* Fourth Column */
24153660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24163660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24173660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24183660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24193660e330SKris Buschelman 
24203660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24213660e330SKris Buschelman       SSE_INLINE_END_3
24223660e330SKris Buschelman 
24233660e330SKris Buschelman       v    = aa + ai16 + 16;
24243660e330SKris Buschelman       idt -= 4;
24253660e330SKris Buschelman     }
2426eb05f457SKris Buschelman 
2427eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
    /* walks from the last entry to the first because x and t share storage */
2428eb05f457SKris Buschelman     idt = 4*(n-1);
2429eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2430eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2431eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2432eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2433eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2434eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2435eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2436eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2437eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
243854693613SKris Buschelman       idt -= 4;
24393660e330SKris Buschelman     }
2440eb05f457SKris Buschelman 
2441eb05f457SKris Buschelman   } /* End of artificial scope. */
24426f6a888dSBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
24436f6a888dSBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
24443660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
24453660e330SKris Buschelman   SSE_SCOPE_END;
24463660e330SKris Buschelman   PetscFunctionReturn(0);
24473660e330SKris Buschelman }
24483660e330SKris Buschelman 
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward sweep over the lower
   triangular blocks and backward sweep over the upper triangular blocks
   (4x4 blocks), written with the SSE_* / XMM* macros.  Reads bb, writes xx.
   Block-column indices are read from a->j as int.

   Both sweeps operate on t, the storage of x viewed as MatScalar; a final loop
   converts t to PetscScalar in place, last entry first.

   NOTE(review): ai[1], t[0..3] and b[0..3] are accessed before any test on n,
   so at least one block row seems to be assumed - confirm with callers.

   Ends with PetscFunctionReturn(0); each ierr is passed to CHKERRQ.
*/
24497cf1b8d3SKris Buschelman #undef __FUNCT__
24507cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
24517cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
24527cf1b8d3SKris Buschelman {
24537cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
24547cf1b8d3SKris Buschelman   int            *aj=a->j;
24557cf1b8d3SKris Buschelman   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
24567cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
24577cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
24587cf1b8d3SKris Buschelman 
24597cf1b8d3SKris Buschelman   PetscFunctionBegin;
24607cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
24617cf1b8d3SKris Buschelman   /*
24627cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
24637cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
24647cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
24657cf1b8d3SKris Buschelman   */
24667cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
24677cf1b8d3SKris Buschelman 
24687cf1b8d3SKris Buschelman   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
24697cf1b8d3SKris Buschelman   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
24707cf1b8d3SKris Buschelman   {
24717cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
24727cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
24737cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
24747cf1b8d3SKris Buschelman     int       jdx,idx;
24757cf1b8d3SKris Buschelman     int       *vi;
24767cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
24777cf1b8d3SKris Buschelman 
24787cf1b8d3SKris Buschelman     /* First block is the identity. */
24797cf1b8d3SKris Buschelman     idx  = 0;
24807cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
24817cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
24827cf1b8d3SKris Buschelman 
    /* i is advanced inside the body (ai[++i] near the end), not in the for header */
24837cf1b8d3SKris Buschelman     for (i=1; i<n;) {
24847cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi: block-column indices of row i; nz: blocks stored before the diagonal */
24857cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
24867cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
24877cf1b8d3SKris Buschelman       idx +=  4;
24887cf1b8d3SKris Buschelman 
24897cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
24907cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 is loaded from t[idx..] here and stored back to the same place after
         the inner loop; the SSE_SUB_PS lines in between name it as first operand */
24917cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
24927cf1b8d3SKris Buschelman 
24937cf1b8d3SKris Buschelman       while (nz--) {
24947cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
24957cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
24967cf1b8d3SKris Buschelman /*          jdx = *vi++; */
24977cf1b8d3SKris Buschelman 
24987cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
        /* macro arguments: &t[jdx] is passed first, the current block v second */
24997cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
25007cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25017cf1b8d3SKris Buschelman 
25027cf1b8d3SKris Buschelman           /* First Column */
25037cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25047cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25057cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25067cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25077cf1b8d3SKris Buschelman 
25087cf1b8d3SKris Buschelman           /* Second Column */
25097cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25107cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25117cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25127cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25137cf1b8d3SKris Buschelman 
25147cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25157cf1b8d3SKris Buschelman 
25167cf1b8d3SKris Buschelman           /* Third Column */
25177cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25187cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25197cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25207cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25217cf1b8d3SKris Buschelman 
25227cf1b8d3SKris Buschelman           /* Fourth Column */
25237cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25247cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25257cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25267cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25277cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25287cf1b8d3SKris Buschelman 
25297cf1b8d3SKris Buschelman         v  += 16;
25307cf1b8d3SKris Buschelman       }
      /* advance to the next block row; on the last pass this reads ai[n] */
25317cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
25327cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
25337cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
25347cf1b8d3SKris Buschelman     }
25357cf1b8d3SKris Buschelman 
25367cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
25377cf1b8d3SKris Buschelman 
25387cf1b8d3SKris Buschelman     idt  = 4*(n-1);
25397cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
25407cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
    /* i is decremented inside the body (diag[--i] below), not in the for header */
25417cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
25427cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi: indices of the blocks after the diagonal of row i; nz: their count */
25437cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
25447cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25457cf1b8d3SKris Buschelman 
25467cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25477cf1b8d3SKris Buschelman 
25487cf1b8d3SKris Buschelman       while (nz--) {
25497cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25507cf1b8d3SKris Buschelman         idx = 4*(*vi++);
25517cf1b8d3SKris Buschelman /*          idx = *vi++; */
25527cf1b8d3SKris Buschelman 
25537cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
25547cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
25557cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25567cf1b8d3SKris Buschelman 
25577cf1b8d3SKris Buschelman           /* First Column */
25587cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25597cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25607cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25617cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25627cf1b8d3SKris Buschelman 
25637cf1b8d3SKris Buschelman           /* Second Column */
25647cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25657cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25667cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25677cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25687cf1b8d3SKris Buschelman 
25697cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25707cf1b8d3SKris Buschelman 
25717cf1b8d3SKris Buschelman           /* Third Column */
25727cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25737cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25747cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25757cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25767cf1b8d3SKris Buschelman 
25777cf1b8d3SKris Buschelman           /* Fourth Column */
25787cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25797cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25807cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25817cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25827cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25837cf1b8d3SKris Buschelman         v  += 16;
25847cf1b8d3SKris Buschelman       }
      /* v now points at the diagonal block of the current row; ai16 is then moved
         on to the previous row so that its diagonal block can be prefetched.
         NOTE(review): on the final pass i goes from 0 to -1, so diag[-1] is read
         here.  The value only reaches the prefetch macros and the v assigned at
         the bottom of the loop, which is not used again once the loop exits, but
         the read itself is outside the indices 0..n-1 used everywhere else. */
25857cf1b8d3SKris Buschelman       v    = aa + ai16;
25867cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
25877cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
25887cf1b8d3SKris Buschelman       /*
25897cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
25907cf1b8d3SKris Buschelman          which was inverted as part of the factorization
25917cf1b8d3SKris Buschelman       */
      /* macro arguments: v (diagonal block) is multiplied, &t[idt] receives the
         store, aa+ai16 is referenced only by the SSE_PREFETCH_NTA line */
25927cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
25937cf1b8d3SKris Buschelman         /* First Column */
25947cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
25957cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
25967cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
25977cf1b8d3SKris Buschelman 
25987cf1b8d3SKris Buschelman         /* Second Column */
25997cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
26007cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
26017cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
26027cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
26037cf1b8d3SKris Buschelman 
26047cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
26057cf1b8d3SKris Buschelman 
26067cf1b8d3SKris Buschelman         /* Third Column */
26077cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
26087cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
26097cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
26107cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
26117cf1b8d3SKris Buschelman 
26127cf1b8d3SKris Buschelman         /* Fourth Column */
26137cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
26147cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
26157cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
26167cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
26177cf1b8d3SKris Buschelman 
26187cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
26197cf1b8d3SKris Buschelman       SSE_INLINE_END_3
26207cf1b8d3SKris Buschelman 
26217cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
26227cf1b8d3SKris Buschelman       idt -= 4;
26237cf1b8d3SKris Buschelman     }
26247cf1b8d3SKris Buschelman 
26257cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
    /* walks from the last entry to the first because x and t share storage */
26267cf1b8d3SKris Buschelman     idt = 4*(n-1);
26277cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
26287cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
26297cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
26307cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
26317cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
26327cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
26337cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
26347cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
26357cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
26367cf1b8d3SKris Buschelman       idt -= 4;
26377cf1b8d3SKris Buschelman     }
26387cf1b8d3SKris Buschelman 
26397cf1b8d3SKris Buschelman   } /* End of artificial scope. */
26407cf1b8d3SKris Buschelman   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
26417cf1b8d3SKris Buschelman   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
26427cf1b8d3SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
26437cf1b8d3SKris Buschelman   SSE_SCOPE_END;
26447cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
26457cf1b8d3SKris Buschelman }
26467cf1b8d3SKris Buschelman 
26473660e330SKris Buschelman #endif
/*
   MatSolve_SeqBAIJ_3 - forward sweep over the lower triangular blocks followed
   by a backward sweep over the upper triangular blocks, using 3x3 blocks, with
   the row and column index sets a->row and a->col applied.

   The right-hand side is read from bb through the row indices, intermediate
   results are kept in the work array a->solve_work (t) at unpermuted block
   positions, and the final values are scattered into xx through the column
   indices.

   NOTE(review): the first row index, t[0..2] and the corresponding entries of b
   are accessed before any test on n, so at least one block row seems to be
   assumed - confirm with callers.

   Ends with PetscFunctionReturn(0); each ierr is passed to CHKERRQ.
*/
26484a2ae208SSatish Balay #undef __FUNCT__
26494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
26504e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
26514e2b4712SSatish Balay {
26524e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
26534e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
26544e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
26554e2b4712SSatish Balay   int             *diag = a->diag;
26563f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
265787828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
26584e2b4712SSatish Balay 
26594e2b4712SSatish Balay   PetscFunctionBegin;
2660b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2661b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2662f1af5d2fSBarry Smith   t  = a->solve_work;
26634e2b4712SSatish Balay 
  /* r walks the row indices from the front; c starts at the last column index
     and is walked backwards during the backward sweep */
26644e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26654e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26664e2b4712SSatish Balay 
26674e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0: copied from the b entries selected by the first row index */
26684e2b4712SSatish Balay   idx    = 3*(*r++);
2669f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
26704e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* v: first 9-entry block of row i; vi: its block-column indices;
       nz: number of blocks stored before the diagonal block of row i */
26714e2b4712SSatish Balay     v     = aa + 9*ai[i];
26724e2b4712SSatish Balay     vi    = aj + ai[i];
26734e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26744e2b4712SSatish Balay     idx   = 3*(*r++);
2675f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
26764e2b4712SSatish Balay     while (nz--) {
      /* s -= block * t[idx..idx+2]; entry (r,c) of the block is v[r+3*c].
         idx is reused here as the offset into t */
26774e2b4712SSatish Balay       idx   = 3*(*vi++);
2678f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2679f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2680f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2681f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
26824e2b4712SSatish Balay       v += 9;
26834e2b4712SSatish Balay     }
    /* the result is stored at block position i of t, not at the permuted position */
26844e2b4712SSatish Balay     idx = 3*i;
2685f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
26864e2b4712SSatish Balay   }
26874e2b4712SSatish Balay   /* backward solve the upper triangular */
26884e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* v starts at the block right after the diagonal block of row i;
       nz: number of blocks stored after the diagonal */
26894e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
26904e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26914e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26924e2b4712SSatish Balay     idt  = 3*i;
2693f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
26944e2b4712SSatish Balay     while (nz--) {
26954e2b4712SSatish Balay       idx   = 3*(*vi++);
2696f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2697f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2698f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2699f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27004e2b4712SSatish Balay       v += 9;
27014e2b4712SSatish Balay     }
    /* multiply by the diagonal block (presumably stored already inverted -
       TODO confirm); the result is kept in t for later rows and also written
       to x at the position given by the current column index */
27024e2b4712SSatish Balay     idc = 3*(*c--);
27034e2b4712SSatish Balay     v   = aa + 9*diag[i];
2704f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2705f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2706f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
27074e2b4712SSatish Balay   }
27084e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27094e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2710b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2711b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2712b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
27134e2b4712SSatish Balay   PetscFunctionReturn(0);
27144e2b4712SSatish Balay }
27154e2b4712SSatish Balay 
271615091d37SBarry Smith /*
271715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
271815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
271915091d37SBarry Smith */
27204a2ae208SSatish Balay #undef __FUNCT__
27214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
272215091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
272315091d37SBarry Smith {
272415091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
272515091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
272615091d37SBarry Smith   int             ierr,*diag = a->diag;
272715091d37SBarry Smith   MatScalar       *aa=a->a,*v;
272887828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
272915091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
273015091d37SBarry Smith 
273115091d37SBarry Smith   PetscFunctionBegin;
2732b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2733b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
273415091d37SBarry Smith 
273515091d37SBarry Smith 
273615091d37SBarry Smith   /* forward solve the lower triangular */
273715091d37SBarry Smith   idx    = 0;
273815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
273915091d37SBarry Smith   for (i=1; i<n; i++) {
274015091d37SBarry Smith     v     =  aa      + 9*ai[i];
274115091d37SBarry Smith     vi    =  aj      + ai[i];
274215091d37SBarry Smith     nz    =  diag[i] - ai[i];
274315091d37SBarry Smith     idx   +=  3;
2744f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
274515091d37SBarry Smith     while (nz--) {
274615091d37SBarry Smith       jdx   = 3*(*vi++);
274715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2748f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2749f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2750f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
275115091d37SBarry Smith       v    += 9;
275215091d37SBarry Smith     }
2753f1af5d2fSBarry Smith     x[idx]   = s1;
2754f1af5d2fSBarry Smith     x[1+idx] = s2;
2755f1af5d2fSBarry Smith     x[2+idx] = s3;
275615091d37SBarry Smith   }
275715091d37SBarry Smith   /* backward solve the upper triangular */
275815091d37SBarry Smith   for (i=n-1; i>=0; i--){
275915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
276015091d37SBarry Smith     vi   = aj + diag[i] + 1;
276115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
276215091d37SBarry Smith     idt  = 3*i;
2763f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2764f1af5d2fSBarry Smith     s3 = x[2+idt];
276515091d37SBarry Smith     while (nz--) {
276615091d37SBarry Smith       idx   = 3*(*vi++);
276715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2768f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2769f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2770f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
277115091d37SBarry Smith       v    += 9;
277215091d37SBarry Smith     }
277315091d37SBarry Smith     v        = aa +  9*diag[i];
2774f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2775f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2776f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
277715091d37SBarry Smith   }
277815091d37SBarry Smith 
2779b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2780b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2781b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
278215091d37SBarry Smith   PetscFunctionReturn(0);
278315091d37SBarry Smith }
278415091d37SBarry Smith 
27854a2ae208SSatish Balay #undef __FUNCT__
27864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
27874e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
27884e2b4712SSatish Balay {
27894e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
27904e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
27914e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
27924e2b4712SSatish Balay   int             *diag = a->diag;
27933f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
279487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
27954e2b4712SSatish Balay 
27964e2b4712SSatish Balay   PetscFunctionBegin;
2797b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2798b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2799f1af5d2fSBarry Smith   t  = a->solve_work;
28004e2b4712SSatish Balay 
28014e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28024e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28034e2b4712SSatish Balay 
28044e2b4712SSatish Balay   /* forward solve the lower triangular */
28054e2b4712SSatish Balay   idx    = 2*(*r++);
2806f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
28074e2b4712SSatish Balay   for (i=1; i<n; i++) {
28084e2b4712SSatish Balay     v     = aa + 4*ai[i];
28094e2b4712SSatish Balay     vi    = aj + ai[i];
28104e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28114e2b4712SSatish Balay     idx   = 2*(*r++);
2812f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
28134e2b4712SSatish Balay     while (nz--) {
28144e2b4712SSatish Balay       idx   = 2*(*vi++);
2815f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2816f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2817f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28184e2b4712SSatish Balay       v += 4;
28194e2b4712SSatish Balay     }
28204e2b4712SSatish Balay     idx = 2*i;
2821f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
28224e2b4712SSatish Balay   }
28234e2b4712SSatish Balay   /* backward solve the upper triangular */
28244e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28254e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
28264e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28274e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28284e2b4712SSatish Balay     idt  = 2*i;
2829f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
28304e2b4712SSatish Balay     while (nz--) {
28314e2b4712SSatish Balay       idx   = 2*(*vi++);
2832f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2833f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2834f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28354e2b4712SSatish Balay       v += 4;
28364e2b4712SSatish Balay     }
28374e2b4712SSatish Balay     idc = 2*(*c--);
28384e2b4712SSatish Balay     v   = aa + 4*diag[i];
2839f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2840f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
28414e2b4712SSatish Balay   }
28424e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28434e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2844b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2845b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2846b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
28474e2b4712SSatish Balay   PetscFunctionReturn(0);
28484e2b4712SSatish Balay }
28494e2b4712SSatish Balay 
285015091d37SBarry Smith /*
285115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
285215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
285315091d37SBarry Smith */
28544a2ae208SSatish Balay #undef __FUNCT__
28554a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
285615091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
285715091d37SBarry Smith {
285815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
285915091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
286015091d37SBarry Smith   int             ierr,*diag = a->diag;
286115091d37SBarry Smith   MatScalar       *aa=a->a,*v;
286287828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2;
286315091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
286415091d37SBarry Smith 
286515091d37SBarry Smith   PetscFunctionBegin;
2866b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2867b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
286815091d37SBarry Smith 
286915091d37SBarry Smith   /* forward solve the lower triangular */
287015091d37SBarry Smith   idx    = 0;
287115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
287215091d37SBarry Smith   for (i=1; i<n; i++) {
287315091d37SBarry Smith     v     =  aa      + 4*ai[i];
287415091d37SBarry Smith     vi    =  aj      + ai[i];
287515091d37SBarry Smith     nz    =  diag[i] - ai[i];
287615091d37SBarry Smith     idx   +=  2;
2877f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
287815091d37SBarry Smith     while (nz--) {
287915091d37SBarry Smith       jdx   = 2*(*vi++);
288015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2881f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2882f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
288315091d37SBarry Smith       v    += 4;
288415091d37SBarry Smith     }
2885f1af5d2fSBarry Smith     x[idx]   = s1;
2886f1af5d2fSBarry Smith     x[1+idx] = s2;
288715091d37SBarry Smith   }
288815091d37SBarry Smith   /* backward solve the upper triangular */
288915091d37SBarry Smith   for (i=n-1; i>=0; i--){
289015091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
289115091d37SBarry Smith     vi   = aj + diag[i] + 1;
289215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
289315091d37SBarry Smith     idt  = 2*i;
2894f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
289515091d37SBarry Smith     while (nz--) {
289615091d37SBarry Smith       idx   = 2*(*vi++);
289715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2898f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2899f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
290015091d37SBarry Smith       v    += 4;
290115091d37SBarry Smith     }
290215091d37SBarry Smith     v        = aa +  4*diag[i];
2903f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2904f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
290515091d37SBarry Smith   }
290615091d37SBarry Smith 
2907b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2908b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2909b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
291015091d37SBarry Smith   PetscFunctionReturn(0);
291115091d37SBarry Smith }
291215091d37SBarry Smith 
29134a2ae208SSatish Balay #undef __FUNCT__
29144a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
29154e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
29164e2b4712SSatish Balay {
29174e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
29184e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
29194e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
29204e2b4712SSatish Balay   int             *diag = a->diag;
29213f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
292287828ca2SBarry Smith   PetscScalar     *x,*b,s1,*t;
29234e2b4712SSatish Balay 
29244e2b4712SSatish Balay   PetscFunctionBegin;
29254e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
29264e2b4712SSatish Balay 
2927b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2928b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
2929f1af5d2fSBarry Smith   t  = a->solve_work;
29304e2b4712SSatish Balay 
29314e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29324e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29334e2b4712SSatish Balay 
29344e2b4712SSatish Balay   /* forward solve the lower triangular */
2935f1af5d2fSBarry Smith   t[0] = b[*r++];
29364e2b4712SSatish Balay   for (i=1; i<n; i++) {
29374e2b4712SSatish Balay     v     = aa + ai[i];
29384e2b4712SSatish Balay     vi    = aj + ai[i];
29394e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2940f1af5d2fSBarry Smith     s1  = b[*r++];
29414e2b4712SSatish Balay     while (nz--) {
2942f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29434e2b4712SSatish Balay     }
2944f1af5d2fSBarry Smith     t[i] = s1;
29454e2b4712SSatish Balay   }
29464e2b4712SSatish Balay   /* backward solve the upper triangular */
29474e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29484e2b4712SSatish Balay     v    = aa + diag[i] + 1;
29494e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29504e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2951f1af5d2fSBarry Smith     s1 = t[i];
29524e2b4712SSatish Balay     while (nz--) {
2953f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29544e2b4712SSatish Balay     }
2955f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
29564e2b4712SSatish Balay   }
29574e2b4712SSatish Balay 
29584e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29594e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2960b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
2961b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
2962b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
29634e2b4712SSatish Balay   PetscFunctionReturn(0);
29644e2b4712SSatish Balay }
296515091d37SBarry Smith /*
296615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
296715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
296815091d37SBarry Smith */
29694a2ae208SSatish Balay #undef __FUNCT__
29704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
297115091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
297215091d37SBarry Smith {
297315091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
297415091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
297515091d37SBarry Smith   int             ierr,*diag = a->diag;
297615091d37SBarry Smith   MatScalar       *aa=a->a;
297787828ca2SBarry Smith   PetscScalar     *x,*b;
297887828ca2SBarry Smith   PetscScalar     s1,x1;
297915091d37SBarry Smith   MatScalar       *v;
298015091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
298115091d37SBarry Smith 
298215091d37SBarry Smith   PetscFunctionBegin;
2983b1d4fb26SBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
2984b1d4fb26SBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
298515091d37SBarry Smith 
298615091d37SBarry Smith   /* forward solve the lower triangular */
298715091d37SBarry Smith   idx    = 0;
298815091d37SBarry Smith   x[0]   = b[0];
298915091d37SBarry Smith   for (i=1; i<n; i++) {
299015091d37SBarry Smith     v     =  aa      + ai[i];
299115091d37SBarry Smith     vi    =  aj      + ai[i];
299215091d37SBarry Smith     nz    =  diag[i] - ai[i];
299315091d37SBarry Smith     idx   +=  1;
2994f1af5d2fSBarry Smith     s1  =  b[idx];
299515091d37SBarry Smith     while (nz--) {
299615091d37SBarry Smith       jdx   = *vi++;
299715091d37SBarry Smith       x1    = x[jdx];
2998f1af5d2fSBarry Smith       s1 -= v[0]*x1;
299915091d37SBarry Smith       v    += 1;
300015091d37SBarry Smith     }
3001f1af5d2fSBarry Smith     x[idx]   = s1;
300215091d37SBarry Smith   }
300315091d37SBarry Smith   /* backward solve the upper triangular */
300415091d37SBarry Smith   for (i=n-1; i>=0; i--){
300515091d37SBarry Smith     v    = aa + diag[i] + 1;
300615091d37SBarry Smith     vi   = aj + diag[i] + 1;
300715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
300815091d37SBarry Smith     idt  = i;
3009f1af5d2fSBarry Smith     s1 = x[idt];
301015091d37SBarry Smith     while (nz--) {
301115091d37SBarry Smith       idx   = *vi++;
301215091d37SBarry Smith       x1    = x[idx];
3013f1af5d2fSBarry Smith       s1 -= v[0]*x1;
301415091d37SBarry Smith       v    += 1;
301515091d37SBarry Smith     }
301615091d37SBarry Smith     v        = aa +  diag[i];
3017f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
301815091d37SBarry Smith   }
3019b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
3020b1d4fb26SBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
3021b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
302215091d37SBarry Smith   PetscFunctionReturn(0);
302315091d37SBarry Smith }
30244e2b4712SSatish Balay 
30254e2b4712SSatish Balay /* ----------------------------------------------------------------*/
30264e2b4712SSatish Balay /*
30274e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
30284e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
30294e2b4712SSatish Balay    Not a good example of code reuse.
30304e2b4712SSatish Balay */
3031ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3032435faa5fSBarry Smith 
30334a2ae208SSatish Balay #undef __FUNCT__
30344a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3035b380c88cSHong Zhang int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatFactorInfo *info,Mat *fact)
30364e2b4712SSatish Balay {
30374e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
30384e2b4712SSatish Balay   IS          isicol;
30394e2b4712SSatish Balay   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
30404e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3041eb150c5cSKris Buschelman   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3042435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
30434533b203SBarry Smith   PetscTruth  col_identity,row_identity;
3044329f5518SBarry Smith   PetscReal   f;
30454e2b4712SSatish Balay 
30464e2b4712SSatish Balay   PetscFunctionBegin;
3047435faa5fSBarry Smith   f             = info->fill;
3048335d9088SBarry Smith   levels        = (int)info->levels;
3049335d9088SBarry Smith   diagonal_fill = (int)info->diagonal_fill;
30504c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3051667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3052667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3053309c388cSBarry Smith 
3054309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3055bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3056bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
3057bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
3058bb3d539aSBarry Smith     if (!b->diag) {
3059bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3060bb3d539aSBarry Smith     }
3061bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3062bb3d539aSBarry Smith     b->row        = isrow;
3063bb3d539aSBarry Smith     b->col        = iscol;
3064bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3065bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3066bb3d539aSBarry Smith     b->icol       = isicol;
3067bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
306887828ca2SBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3069309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
30704e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30714e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
30724e2b4712SSatish Balay 
30734e2b4712SSatish Balay     /* get new row pointers */
3074b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
30754e2b4712SSatish Balay     ainew[0] = 0;
30764e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
30774e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
307882502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
30794e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
308082502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
30814e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3082b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
30834e2b4712SSatish Balay     /* im is level for each filled value */
3084b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
30854e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3086b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
30874e2b4712SSatish Balay     dloc[0]  = 0;
30884e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3089435faa5fSBarry Smith 
3090435faa5fSBarry Smith       /* copy prow into linked list */
30914e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
309229bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
30934e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
30944e2b4712SSatish Balay       fill[n]    = n;
3095435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
30964e2b4712SSatish Balay       while (nz--) {
30974e2b4712SSatish Balay 	fm  = n;
30984e2b4712SSatish Balay 	idx = ic[*xi++];
30994e2b4712SSatish Balay 	do {
31004e2b4712SSatish Balay 	  m  = fm;
31014e2b4712SSatish Balay 	  fm = fill[m];
31024e2b4712SSatish Balay 	} while (fm < idx);
31034e2b4712SSatish Balay 	fill[m]   = idx;
31044e2b4712SSatish Balay 	fill[idx] = fm;
31054e2b4712SSatish Balay 	im[idx]   = 0;
31064e2b4712SSatish Balay       }
3107435faa5fSBarry Smith 
3108435faa5fSBarry Smith       /* make sure diagonal entry is included */
3109435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3110435faa5fSBarry Smith 	fm = n;
3111435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3112435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3113435faa5fSBarry Smith 	fill[fm]   = prow;
3114435faa5fSBarry Smith 	im[prow]   = 0;
3115435faa5fSBarry Smith 	nzf++;
3116335d9088SBarry Smith 	dcount++;
3117435faa5fSBarry Smith       }
3118435faa5fSBarry Smith 
31194e2b4712SSatish Balay       nzi = 0;
31204e2b4712SSatish Balay       row = fill[n];
31214e2b4712SSatish Balay       while (row < prow) {
31224e2b4712SSatish Balay 	incrlev = im[row] + 1;
31234e2b4712SSatish Balay 	nz      = dloc[row];
3124435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
31254e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
31264e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
31274e2b4712SSatish Balay 	fm      = row;
31284e2b4712SSatish Balay 	while (nnz-- > 0) {
31294e2b4712SSatish Balay 	  idx = *xi++;
31304e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
31314e2b4712SSatish Balay 	    flev++;
31324e2b4712SSatish Balay 	    continue;
31334e2b4712SSatish Balay 	  }
31344e2b4712SSatish Balay 	  do {
31354e2b4712SSatish Balay 	    m  = fm;
31364e2b4712SSatish Balay 	    fm = fill[m];
31374e2b4712SSatish Balay 	  } while (fm < idx);
31384e2b4712SSatish Balay 	  if (fm != idx) {
31394e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
31404e2b4712SSatish Balay 	    fill[m]   = idx;
31414e2b4712SSatish Balay 	    fill[idx] = fm;
31424e2b4712SSatish Balay 	    fm        = idx;
31434e2b4712SSatish Balay 	    nzf++;
3144ecf371e4SBarry Smith 	  } else {
31454e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
31464e2b4712SSatish Balay 	  }
31474e2b4712SSatish Balay 	  flev++;
31484e2b4712SSatish Balay 	}
31494e2b4712SSatish Balay 	row = fill[row];
31504e2b4712SSatish Balay 	nzi++;
31514e2b4712SSatish Balay       }
31524e2b4712SSatish Balay       /* copy new filled row into permanent storage */
31534e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
31544e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3155ecf371e4SBarry Smith 
3156ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3157ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3158ecf371e4SBarry Smith 	/* just double the memory each time */
3159ecf371e4SBarry Smith 	int maxadd = jmax;
3160ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
31614e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
31624e2b4712SSatish Balay 	jmax += maxadd;
3163ecf371e4SBarry Smith 
3164ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
316582502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3166549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3167606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
31684e2b4712SSatish Balay 	ajnew = xi;
316982502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3170549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3171606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
31724e2b4712SSatish Balay 	ajfill = xi;
3173eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
31744e2b4712SSatish Balay       }
31754e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
31764e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
31774e2b4712SSatish Balay       dloc[prow]  = nzi;
31784e2b4712SSatish Balay       fm          = fill[n];
31794e2b4712SSatish Balay       while (nzf--) {
31804e2b4712SSatish Balay 	*xi++   = fm;
31814e2b4712SSatish Balay 	*flev++ = im[fm];
31824e2b4712SSatish Balay 	fm      = fill[fm];
31834e2b4712SSatish Balay       }
3184435faa5fSBarry Smith       /* make sure row has diagonal entry */
3185435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
318629bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3187435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3188435faa5fSBarry Smith       }
31894e2b4712SSatish Balay     }
3190606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
31914e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
31924e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3193606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3194606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
31954e2b4712SSatish Balay 
31964e2b4712SSatish Balay     {
3197329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3198eb150c5cSKris Buschelman       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3199b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3200b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3201b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3202335d9088SBarry Smith       if (diagonal_fill) {
3203b1bcba4aSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3204335d9088SBarry Smith       }
32054e2b4712SSatish Balay     }
32064e2b4712SSatish Balay 
32074e2b4712SSatish Balay     /* put together the new matrix */
3208*f204ca49SKris Buschelman     ierr = MatCreate(A->comm,bs*n,bs*n,bs*n,bs*n,fact);CHKERRQ(ierr);
3209*f204ca49SKris Buschelman     ierr = MatSetType(*fact,A->type_name);CHKERRQ(ierr);
3210*f204ca49SKris Buschelman     ierr = MatSeqBAIJSetPreallocation(*fact,bs,0,PETSC_NULL);CHKERRQ(ierr);
3211b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
32124e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
3213606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
32147c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
32153f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
32164e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
3217606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
3218606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
321982502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
32204e2b4712SSatish Balay     b->j          = ajnew;
32214e2b4712SSatish Balay     b->i          = ainew;
32224e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
32234e2b4712SSatish Balay     b->diag       = dloc;
32244e2b4712SSatish Balay     b->ilen       = 0;
32254e2b4712SSatish Balay     b->imax       = 0;
32264e2b4712SSatish Balay     b->row        = isrow;
32274e2b4712SSatish Balay     b->col        = iscol;
3228bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3230c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3231e51c0b9cSSatish Balay     b->icol       = isicol;
323287828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
32334e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
32344e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
323587828ca2SBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
32364e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
32374e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
32384e2b4712SSatish Balay 
3239eb150c5cSKris Buschelman     (*fact)->info.factor_mallocs    = reallocate;
32404e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
3241329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242309c388cSBarry Smith   }
32434e2b4712SSatish Balay 
3244309c388cSBarry Smith   if (row_identity && col_identity) {
3245732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
32468661488fSKris Buschelman   }
32478661488fSKris Buschelman   PetscFunctionReturn(0);
32488661488fSKris Buschelman }
32498661488fSKris Buschelman 
3250732ee342SKris Buschelman #undef __FUNCT__
32517e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
32527e7071cdSKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
32537e7071cdSKris Buschelman {
325412272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
325512272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
32565a9542e3SKris Buschelman   PetscFunctionBegin;
32577cf1b8d3SKris Buschelman   /* Undo Column scaling */
32587cf1b8d3SKris Buschelman /*    while (nz--) { */
32597cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
32607cf1b8d3SKris Buschelman /*    } */
3261c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3262c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
32637cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
32647cf1b8d3SKris Buschelman }
32657cf1b8d3SKris Buschelman 
32667cf1b8d3SKris Buschelman #undef __FUNCT__
32677cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
32687cf1b8d3SKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
32697cf1b8d3SKris Buschelman {
32707cf1b8d3SKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
32710b9da03eSKris Buschelman   int *AJ=a->j,nz=a->nz;
32722aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
32735a9542e3SKris Buschelman   PetscFunctionBegin;
32740b9da03eSKris Buschelman   /* Is this really necessary? */
327520235379SKris Buschelman   while (nz--) {
32760b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
32777e7071cdSKris Buschelman   }
3278c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
32797e7071cdSKris Buschelman   PetscFunctionReturn(0);
32807e7071cdSKris Buschelman }
32817e7071cdSKris Buschelman 
32827e7071cdSKris Buschelman #undef __FUNCT__
3283732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3284732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
32858661488fSKris Buschelman {
32868661488fSKris Buschelman   /*
32878661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
32888661488fSKris Buschelman       with natural ordering
32898661488fSKris Buschelman   */
32908661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
32918661488fSKris Buschelman 
32928661488fSKris Buschelman   PetscFunctionBegin;
3293a7ba9c3cSKris Buschelman   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3294a7ba9c3cSKris Buschelman   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
32958661488fSKris Buschelman   switch (a->bs) {
32968661488fSKris Buschelman   case 1:
32978661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3298732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3299732ee342SKris Buschelman     break;
3300309c388cSBarry Smith   case 2:
33018661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3302732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3303309c388cSBarry Smith     break;
3304309c388cSBarry Smith   case 3:
33058661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3306732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3307309c388cSBarry Smith     break;
3308309c388cSBarry Smith   case 4:
3309a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3310a7d8d0baSKris Buschelman     {
3311a7d8d0baSKris Buschelman       PetscTruth  sse_enabled_local;
331243b9cc93SKris Buschelman       int         ierr;
3313ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
33146b7cc795SKris Buschelman       if (sse_enabled_local) {
3315b988c221SKris Buschelman #  if defined(PETSC_HAVE_SSE)
33167cf1b8d3SKris Buschelman         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
33177cf1b8d3SKris Buschelman         if (n==(unsigned short)n) {
33182aa5897fSKris Buschelman           unsigned short *aj=(unsigned short *)AJ;
331913c7ffeeSKris Buschelman           for (i=0;i<nz;i++) {
33202aa5897fSKris Buschelman             aj[i] = (unsigned short)AJ[i];
332113c7ffeeSKris Buschelman           }
33227cf1b8d3SKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
33237cf1b8d3SKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
332486b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
33257cf1b8d3SKris Buschelman         } else {
33267cf1b8d3SKris Buschelman         /* Scale the column indices for easier indexing in MatSolve. */
33277cf1b8d3SKris Buschelman /*            for (i=0;i<nz;i++) { */
33287cf1b8d3SKris Buschelman /*              AJ[i] = AJ[i]*4; */
33297cf1b8d3SKris Buschelman /*            } */
33307e7071cdSKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
33318661488fSKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
333286b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
33337cf1b8d3SKris Buschelman         }
3334b988c221SKris Buschelman #  else
3335b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3336b988c221SKris Buschelman         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3337b988c221SKris Buschelman #  endif
33383ba47ebaSKris Buschelman       } else {
33398661488fSKris Buschelman         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3340732ee342SKris Buschelman         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
33413ba47ebaSKris Buschelman       }
3342a7d8d0baSKris Buschelman     }
3343a7d8d0baSKris Buschelman #else
3344a7d8d0baSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3345a7d8d0baSKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3346a7d8d0baSKris Buschelman #endif
3347309c388cSBarry Smith     break;
3348309c388cSBarry Smith   case 5:
33498661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3350732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3351309c388cSBarry Smith     break;
3352309c388cSBarry Smith   case 6:
33538661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3354732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3355309c388cSBarry Smith     break;
3356309c388cSBarry Smith   case 7:
33578661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3358732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3359309c388cSBarry Smith     break;
3360309c388cSBarry Smith   }
33614e2b4712SSatish Balay   PetscFunctionReturn(0);
33624e2b4712SSatish Balay }
3363732ee342SKris Buschelman 
3364732ee342SKris Buschelman #undef __FUNCT__
3365732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3366732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A)
3367732ee342SKris Buschelman {
3368732ee342SKris Buschelman   /*
3369732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3370732ee342SKris Buschelman       with natural ordering
3371732ee342SKris Buschelman   */
3372732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3373732ee342SKris Buschelman   IS          row = a->row, col = a->col;
3374732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
337523c42b7cSKris Buschelman   PetscTruth  use_natural;
3376732ee342SKris Buschelman   int         ierr;
3377732ee342SKris Buschelman 
3378732ee342SKris Buschelman   PetscFunctionBegin;
3379cf242676SKris Buschelman 
338094ee7fc8SKris Buschelman   use_natural = PETSC_FALSE;
338121360622SBarry Smith   if (row && col) {
3382732ee342SKris Buschelman     ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3383732ee342SKris Buschelman     ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3384732ee342SKris Buschelman 
3385732ee342SKris Buschelman     if (row_identity && col_identity) {
3386732ee342SKris Buschelman       use_natural = PETSC_TRUE;
3387732ee342SKris Buschelman     }
338821360622SBarry Smith   } else {
338921360622SBarry Smith     use_natural = PETSC_TRUE;
339021360622SBarry Smith   }
339121360622SBarry Smith 
3392732ee342SKris Buschelman   switch (a->bs) {
3393732ee342SKris Buschelman   case 1:
3394732ee342SKris Buschelman     if (use_natural) {
3395732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3396732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3397732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3398732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3399732ee342SKris Buschelman     } else {
3400732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
3401732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3402732ee342SKris Buschelman     }
3403732ee342SKris Buschelman     break;
3404732ee342SKris Buschelman   case 2:
3405732ee342SKris Buschelman     if (use_natural) {
3406732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3407732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3408732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3409732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3410732ee342SKris Buschelman     } else {
3411732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
3412732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3413732ee342SKris Buschelman     }
3414732ee342SKris Buschelman     break;
3415732ee342SKris Buschelman   case 3:
3416732ee342SKris Buschelman     if (use_natural) {
3417732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3418732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3419732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3420732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3421732ee342SKris Buschelman     } else {
3422732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
3423732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3424732ee342SKris Buschelman     }
3425732ee342SKris Buschelman     break;
3426732ee342SKris Buschelman   case 4:
3427f26ec98cSKris Buschelman     {
3428123145dfSKris Buschelman       PetscTruth sse_enabled_local;
3429ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3430732ee342SKris Buschelman       if (use_natural) {
34312859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3432f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Natural + Single + SSE */
3433eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3434995eb297SKris Buschelman           int n=a->mbs;
3435995eb297SKris Buschelman           if (n==(unsigned short)n) {
3436995eb297SKris Buschelman             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3437995eb297SKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3438995eb297SKris Buschelman           } else {
3439732ee342SKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
344086b4ebfeSKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3441995eb297SKris Buschelman           }
3442eb150c5cSKris Buschelman #  else
3443eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3444eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3445eb150c5cSKris Buschelman #  endif
3446f26ec98cSKris Buschelman         } else { /* Natural + Single */
3447f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3448123145dfSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3449f26ec98cSKris Buschelman         }
34502859b196SKris Buschelman #else
34512859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3452123145dfSKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
34532859b196SKris Buschelman #endif
3454732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3455123145dfSKris Buschelman         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3456f26ec98cSKris Buschelman       } else { /* Arbitrary ordering */
34572859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3458f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3459eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3460732ee342SKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3461732ee342SKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3462eb150c5cSKris Buschelman #  else
3463eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3464eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3465eb150c5cSKris Buschelman #  endif
3466f26ec98cSKris Buschelman         } else { /* Arbitrary + Single */
3467f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3468f26ec98cSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3469732ee342SKris Buschelman         }
34702859b196SKris Buschelman #else
34712859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
34722859b196SKris Buschelman #endif
3473732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3474732ee342SKris Buschelman       }
3475f26ec98cSKris Buschelman     }
3476732ee342SKris Buschelman     break;
3477732ee342SKris Buschelman   case 5:
3478732ee342SKris Buschelman     if (use_natural) {
3479732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3480732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3481732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3482732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3483732ee342SKris Buschelman     } else {
3484732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3485732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3486732ee342SKris Buschelman     }
3487732ee342SKris Buschelman     break;
3488732ee342SKris Buschelman   case 6:
3489732ee342SKris Buschelman     if (use_natural) {
3490732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3491732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3492732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3493732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3494732ee342SKris Buschelman     } else {
3495732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3496732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3497732ee342SKris Buschelman     }
3498732ee342SKris Buschelman     break;
3499732ee342SKris Buschelman   case 7:
3500732ee342SKris Buschelman     if (use_natural) {
3501732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3502732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3503732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3504732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3505732ee342SKris Buschelman     } else {
3506732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3507732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3508732ee342SKris Buschelman     }
3509732ee342SKris Buschelman     break;
351031801e53SKris Buschelman   default:
351131801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
351231801e53SKris Buschelman     break;
3513732ee342SKris Buschelman   }
3514732ee342SKris Buschelman   PetscFunctionReturn(0);
3515732ee342SKris Buschelman }
3516732ee342SKris Buschelman 
3517732ee342SKris Buschelman #undef __FUNCT__
3518732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3519732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3520732ee342SKris Buschelman   int ierr;
3521732ee342SKris Buschelman 
3522732ee342SKris Buschelman   PetscFunctionBegin;
3523732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3524cf242676SKris Buschelman   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3525732ee342SKris Buschelman     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3526cf242676SKris Buschelman   } else {
3527cf242676SKris Buschelman     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3528cf242676SKris Buschelman   }
3529732ee342SKris Buschelman   PetscFunctionReturn(0);
3530732ee342SKris Buschelman }
3531732ee342SKris Buschelman 
3532732ee342SKris Buschelman #undef __FUNCT__
3533732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3534732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3535732ee342SKris Buschelman   int ierr;
3536732ee342SKris Buschelman 
3537732ee342SKris Buschelman   PetscFunctionBegin;
3538732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3539732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3540732ee342SKris Buschelman   PetscFunctionReturn(0);
3541732ee342SKris Buschelman }
3542732ee342SKris Buschelman 
3543732ee342SKris Buschelman 
3544