xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 5a9542e37d17b70bb2082cfd8279d70e693aec6c)
173f4d377SMatthew Knepley /*$Id: baijfact2.c,v 1.72 2001/09/11 16:32:33 bsmith Exp $*/
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
74e2b4712SSatish Balay #include "src/vec/vecimpl.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17f1af5d2fSBarry Smith   int             *diag = a->diag;
18f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
1987828ca2SBarry Smith   PetscScalar     s1,*x,*b;
20f1af5d2fSBarry Smith 
21f1af5d2fSBarry Smith   PetscFunctionBegin;
22ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25f1af5d2fSBarry Smith 
26f1af5d2fSBarry Smith   /* forward solve the U^T */
27f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith     v     = aa + diag[i];
30f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
31ef66eb69SBarry Smith     s1    = (*v++)*x[i];
32f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
33f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
34f1af5d2fSBarry Smith     while (nz--) {
35f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
36f1af5d2fSBarry Smith     }
37f1af5d2fSBarry Smith     x[i]   = s1;
38f1af5d2fSBarry Smith   }
39f1af5d2fSBarry Smith   /* backward solve the L^T */
40f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
41f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
42f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
43f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
44f1af5d2fSBarry Smith     s1   = x[i];
45f1af5d2fSBarry Smith     while (nz--) {
46f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
47f1af5d2fSBarry Smith     }
48f1af5d2fSBarry Smith   }
49f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
52f1af5d2fSBarry Smith   PetscFunctionReturn(0);
53f1af5d2fSBarry Smith }
54f1af5d2fSBarry Smith 
554a2ae208SSatish Balay #undef __FUNCT__
564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58f1af5d2fSBarry Smith {
59f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
62f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
6387828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
6487828ca2SBarry Smith   PetscScalar     *x,*b;
65f1af5d2fSBarry Smith 
66f1af5d2fSBarry Smith   PetscFunctionBegin;
67ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70f1af5d2fSBarry Smith 
71f1af5d2fSBarry Smith   /* forward solve the U^T */
72f1af5d2fSBarry Smith   idx = 0;
73f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
76f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
77ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
78f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
79f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
80f1af5d2fSBarry Smith     v += 4;
81f1af5d2fSBarry Smith 
82f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
83f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
84f1af5d2fSBarry Smith     while (nz--) {
85f1af5d2fSBarry Smith       oidx = 2*(*vi++);
86f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88f1af5d2fSBarry Smith       v  += 4;
89f1af5d2fSBarry Smith     }
90f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
91f1af5d2fSBarry Smith     idx += 2;
92f1af5d2fSBarry Smith   }
93f1af5d2fSBarry Smith   /* backward solve the L^T */
94f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
95f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
96f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
97f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
98f1af5d2fSBarry Smith     idt  = 2*i;
99f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
100f1af5d2fSBarry Smith     while (nz--) {
101f1af5d2fSBarry Smith       idx   = 2*(*vi--);
102f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104f1af5d2fSBarry Smith       v -= 4;
105f1af5d2fSBarry Smith     }
106f1af5d2fSBarry Smith   }
107f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110f1af5d2fSBarry Smith   PetscFunctionReturn(0);
111f1af5d2fSBarry Smith }
112f1af5d2fSBarry Smith 
1134a2ae208SSatish Balay #undef __FUNCT__
1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
1157c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116f1af5d2fSBarry Smith {
117f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
120f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
12187828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
12287828ca2SBarry Smith   PetscScalar     *x,*b;
123f1af5d2fSBarry Smith 
124f1af5d2fSBarry Smith   PetscFunctionBegin;
125ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   /* forward solve the U^T */
130f1af5d2fSBarry Smith   idx = 0;
131f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
132f1af5d2fSBarry Smith 
133f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
134f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
135ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139f1af5d2fSBarry Smith     v += 9;
140f1af5d2fSBarry Smith 
141f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
142f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
143f1af5d2fSBarry Smith     while (nz--) {
144f1af5d2fSBarry Smith       oidx = 3*(*vi++);
145f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148f1af5d2fSBarry Smith       v  += 9;
149f1af5d2fSBarry Smith     }
150f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151f1af5d2fSBarry Smith     idx += 3;
152f1af5d2fSBarry Smith   }
153f1af5d2fSBarry Smith   /* backward solve the L^T */
154f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
155f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
156f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
157f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
158f1af5d2fSBarry Smith     idt  = 3*i;
159f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160f1af5d2fSBarry Smith     while (nz--) {
161f1af5d2fSBarry Smith       idx   = 3*(*vi--);
162f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165f1af5d2fSBarry Smith       v -= 9;
166f1af5d2fSBarry Smith     }
167f1af5d2fSBarry Smith   }
168f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171f1af5d2fSBarry Smith   PetscFunctionReturn(0);
172f1af5d2fSBarry Smith }
173f1af5d2fSBarry Smith 
1744a2ae208SSatish Balay #undef __FUNCT__
1754a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
1767c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177f1af5d2fSBarry Smith {
178f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
181f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
18287828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
18387828ca2SBarry Smith   PetscScalar     *x,*b;
184f1af5d2fSBarry Smith 
185f1af5d2fSBarry Smith   PetscFunctionBegin;
186ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189f1af5d2fSBarry Smith 
190f1af5d2fSBarry Smith   /* forward solve the U^T */
191f1af5d2fSBarry Smith   idx = 0;
192f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
195f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
196ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201f1af5d2fSBarry Smith     v += 16;
202f1af5d2fSBarry Smith 
203f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
204f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
205f1af5d2fSBarry Smith     while (nz--) {
206f1af5d2fSBarry Smith       oidx = 4*(*vi++);
207f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211f1af5d2fSBarry Smith       v  += 16;
212f1af5d2fSBarry Smith     }
213f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214f1af5d2fSBarry Smith     idx += 4;
215f1af5d2fSBarry Smith   }
216f1af5d2fSBarry Smith   /* backward solve the L^T */
217f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
218f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
219f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
220f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
221f1af5d2fSBarry Smith     idt  = 4*i;
222f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223f1af5d2fSBarry Smith     while (nz--) {
224f1af5d2fSBarry Smith       idx   = 4*(*vi--);
225f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229f1af5d2fSBarry Smith       v -= 16;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
232f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
2407c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241f1af5d2fSBarry Smith {
242f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
245f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
24687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
24787828ca2SBarry Smith   PetscScalar     *x,*b;
248f1af5d2fSBarry Smith 
249f1af5d2fSBarry Smith   PetscFunctionBegin;
250ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   /* forward solve the U^T */
255f1af5d2fSBarry Smith   idx = 0;
256f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
257f1af5d2fSBarry Smith 
258f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
259f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
260ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266f1af5d2fSBarry Smith     v += 25;
267f1af5d2fSBarry Smith 
268f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
269f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
270f1af5d2fSBarry Smith     while (nz--) {
271f1af5d2fSBarry Smith       oidx = 5*(*vi++);
272f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277f1af5d2fSBarry Smith       v  += 25;
278f1af5d2fSBarry Smith     }
279f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280f1af5d2fSBarry Smith     idx += 5;
281f1af5d2fSBarry Smith   }
282f1af5d2fSBarry Smith   /* backward solve the L^T */
283f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
284f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
285f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
286f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
287f1af5d2fSBarry Smith     idt  = 5*i;
288f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289f1af5d2fSBarry Smith     while (nz--) {
290f1af5d2fSBarry Smith       idx   = 5*(*vi--);
291f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296f1af5d2fSBarry Smith       v -= 25;
297f1af5d2fSBarry Smith     }
298f1af5d2fSBarry Smith   }
299f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302f1af5d2fSBarry Smith   PetscFunctionReturn(0);
303f1af5d2fSBarry Smith }
304f1af5d2fSBarry Smith 
3054a2ae208SSatish Balay #undef __FUNCT__
3064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
3077c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308f1af5d2fSBarry Smith {
309f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
312f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
31387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31487828ca2SBarry Smith   PetscScalar     *x,*b;
315f1af5d2fSBarry Smith 
316f1af5d2fSBarry Smith   PetscFunctionBegin;
317ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith   /* forward solve the U^T */
322f1af5d2fSBarry Smith   idx = 0;
323f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
324f1af5d2fSBarry Smith 
325f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
326f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
327ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328ef66eb69SBarry Smith     x6    = x[5+idx];
329f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335f1af5d2fSBarry Smith     v += 36;
336f1af5d2fSBarry Smith 
337f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
338f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
339f1af5d2fSBarry Smith     while (nz--) {
340f1af5d2fSBarry Smith       oidx = 6*(*vi++);
341f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347f1af5d2fSBarry Smith       v  += 36;
348f1af5d2fSBarry Smith     }
349f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350f1af5d2fSBarry Smith     x[5+idx] = s6;
351f1af5d2fSBarry Smith     idx += 6;
352f1af5d2fSBarry Smith   }
353f1af5d2fSBarry Smith   /* backward solve the L^T */
354f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
355f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
356f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
357f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
358f1af5d2fSBarry Smith     idt  = 6*i;
359f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360f1af5d2fSBarry Smith     s6 = x[5+idt];
361f1af5d2fSBarry Smith     while (nz--) {
362f1af5d2fSBarry Smith       idx   = 6*(*vi--);
363f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369f1af5d2fSBarry Smith       v -= 36;
370f1af5d2fSBarry Smith     }
371f1af5d2fSBarry Smith   }
372f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375f1af5d2fSBarry Smith   PetscFunctionReturn(0);
376f1af5d2fSBarry Smith }
377f1af5d2fSBarry Smith 
3784a2ae208SSatish Balay #undef __FUNCT__
3794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
3807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381f1af5d2fSBarry Smith {
382f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
385f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
38687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
38787828ca2SBarry Smith   PetscScalar     *x,*b;
388f1af5d2fSBarry Smith 
389f1af5d2fSBarry Smith   PetscFunctionBegin;
390ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393f1af5d2fSBarry Smith 
394f1af5d2fSBarry Smith   /* forward solve the U^T */
395f1af5d2fSBarry Smith   idx = 0;
396f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
399f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
400ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
402f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409f1af5d2fSBarry Smith     v += 49;
410f1af5d2fSBarry Smith 
411f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
412f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
413f1af5d2fSBarry Smith     while (nz--) {
414f1af5d2fSBarry Smith       oidx = 7*(*vi++);
415f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422f1af5d2fSBarry Smith       v  += 49;
423f1af5d2fSBarry Smith     }
424f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
426f1af5d2fSBarry Smith     idx += 7;
427f1af5d2fSBarry Smith   }
428f1af5d2fSBarry Smith   /* backward solve the L^T */
429f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
430f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
431f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
432f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
433f1af5d2fSBarry Smith     idt  = 7*i;
434f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
436f1af5d2fSBarry Smith     while (nz--) {
437f1af5d2fSBarry Smith       idx   = 7*(*vi--);
438f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445f1af5d2fSBarry Smith       v -= 49;
446f1af5d2fSBarry Smith     }
447f1af5d2fSBarry Smith   }
448f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451f1af5d2fSBarry Smith   PetscFunctionReturn(0);
452f1af5d2fSBarry Smith }
453f1af5d2fSBarry Smith 
454f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4554a2ae208SSatish Balay #undef __FUNCT__
4564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
4577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458f1af5d2fSBarry Smith {
459f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
461f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462f1af5d2fSBarry Smith   int             *diag = a->diag;
463f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
46487828ca2SBarry Smith   PetscScalar     s1,*x,*b,*t;
465f1af5d2fSBarry Smith 
466f1af5d2fSBarry Smith   PetscFunctionBegin;
467f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469f1af5d2fSBarry Smith   t  = a->solve_work;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473f1af5d2fSBarry Smith 
474f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
475f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
476f1af5d2fSBarry Smith     t[i] = b[c[i]];
477f1af5d2fSBarry Smith   }
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith   /* forward solve the U^T */
480f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith     v     = aa + diag[i];
483f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
484f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
485f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
486f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
487f1af5d2fSBarry Smith     while (nz--) {
488f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
489f1af5d2fSBarry Smith     }
490f1af5d2fSBarry Smith     t[i]   = s1;
491f1af5d2fSBarry Smith   }
492f1af5d2fSBarry Smith   /* backward solve the L^T */
493f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
494f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
495f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
496f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
497f1af5d2fSBarry Smith     s1   = t[i];
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith   }
502f1af5d2fSBarry Smith 
503f1af5d2fSBarry Smith   /* copy t into x according to permutation */
504f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
505f1af5d2fSBarry Smith     x[r[i]]   = t[i];
506f1af5d2fSBarry Smith   }
507f1af5d2fSBarry Smith 
508f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
513f1af5d2fSBarry Smith   PetscFunctionReturn(0);
514f1af5d2fSBarry Smith }
515f1af5d2fSBarry Smith 
5164a2ae208SSatish Balay #undef __FUNCT__
5174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
5187c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519f1af5d2fSBarry Smith {
520f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
522f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
524f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
52587828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
52687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
527f1af5d2fSBarry Smith 
528f1af5d2fSBarry Smith   PetscFunctionBegin;
529f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531f1af5d2fSBarry Smith   t  = a->solve_work;
532f1af5d2fSBarry Smith 
533f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535f1af5d2fSBarry Smith 
536f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
537f1af5d2fSBarry Smith   ii = 0;
538f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
539f1af5d2fSBarry Smith     ic      = 2*c[i];
540f1af5d2fSBarry Smith     t[ii]   = b[ic];
541f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
542f1af5d2fSBarry Smith     ii += 2;
543f1af5d2fSBarry Smith   }
544f1af5d2fSBarry Smith 
545f1af5d2fSBarry Smith   /* forward solve the U^T */
546f1af5d2fSBarry Smith   idx = 0;
547f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
550f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
551f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
552f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
553f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
554f1af5d2fSBarry Smith     v += 4;
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
557f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
558f1af5d2fSBarry Smith     while (nz--) {
559f1af5d2fSBarry Smith       oidx = 2*(*vi++);
560f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562f1af5d2fSBarry Smith       v  += 4;
563f1af5d2fSBarry Smith     }
564f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
565f1af5d2fSBarry Smith     idx += 2;
566f1af5d2fSBarry Smith   }
567f1af5d2fSBarry Smith   /* backward solve the L^T */
568f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
569f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
570f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
571f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
572f1af5d2fSBarry Smith     idt  = 2*i;
573f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
574f1af5d2fSBarry Smith     while (nz--) {
575f1af5d2fSBarry Smith       idx   = 2*(*vi--);
576f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578f1af5d2fSBarry Smith       v -= 4;
579f1af5d2fSBarry Smith     }
580f1af5d2fSBarry Smith   }
581f1af5d2fSBarry Smith 
582f1af5d2fSBarry Smith   /* copy t into x according to permutation */
583f1af5d2fSBarry Smith   ii = 0;
584f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
585f1af5d2fSBarry Smith     ir      = 2*r[i];
586f1af5d2fSBarry Smith     x[ir]   = t[ii];
587f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
588f1af5d2fSBarry Smith     ii += 2;
589f1af5d2fSBarry Smith   }
590f1af5d2fSBarry Smith 
591f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596f1af5d2fSBarry Smith   PetscFunctionReturn(0);
597f1af5d2fSBarry Smith }
598f1af5d2fSBarry Smith 
5994a2ae208SSatish Balay #undef __FUNCT__
6004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
6017c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602f1af5d2fSBarry Smith {
603f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
605f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
607f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
60887828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
60987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
610f1af5d2fSBarry Smith 
611f1af5d2fSBarry Smith   PetscFunctionBegin;
612f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614f1af5d2fSBarry Smith   t  = a->solve_work;
615f1af5d2fSBarry Smith 
616f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618f1af5d2fSBarry Smith 
619f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
620f1af5d2fSBarry Smith   ii = 0;
621f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
622f1af5d2fSBarry Smith     ic      = 3*c[i];
623f1af5d2fSBarry Smith     t[ii]   = b[ic];
624f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
625f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
626f1af5d2fSBarry Smith     ii += 3;
627f1af5d2fSBarry Smith   }
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   /* forward solve the U^T */
630f1af5d2fSBarry Smith   idx = 0;
631f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
632f1af5d2fSBarry Smith 
633f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
634f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
635f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639f1af5d2fSBarry Smith     v += 9;
640f1af5d2fSBarry Smith 
641f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
642f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
643f1af5d2fSBarry Smith     while (nz--) {
644f1af5d2fSBarry Smith       oidx = 3*(*vi++);
645f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648f1af5d2fSBarry Smith       v  += 9;
649f1af5d2fSBarry Smith     }
650f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651f1af5d2fSBarry Smith     idx += 3;
652f1af5d2fSBarry Smith   }
653f1af5d2fSBarry Smith   /* backward solve the L^T */
654f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
655f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
656f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
657f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
658f1af5d2fSBarry Smith     idt  = 3*i;
659f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660f1af5d2fSBarry Smith     while (nz--) {
661f1af5d2fSBarry Smith       idx   = 3*(*vi--);
662f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665f1af5d2fSBarry Smith       v -= 9;
666f1af5d2fSBarry Smith     }
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith 
669f1af5d2fSBarry Smith   /* copy t into x according to permutation */
670f1af5d2fSBarry Smith   ii = 0;
671f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
672f1af5d2fSBarry Smith     ir      = 3*r[i];
673f1af5d2fSBarry Smith     x[ir]   = t[ii];
674f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
675f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
676f1af5d2fSBarry Smith     ii += 3;
677f1af5d2fSBarry Smith   }
678f1af5d2fSBarry Smith 
679f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684f1af5d2fSBarry Smith   PetscFunctionReturn(0);
685f1af5d2fSBarry Smith }
686f1af5d2fSBarry Smith 
6874a2ae208SSatish Balay #undef __FUNCT__
6884a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
6897c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690f1af5d2fSBarry Smith {
691f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
693f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
695f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
69687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
69787828ca2SBarry Smith   PetscScalar     *x,*b,*t;
698f1af5d2fSBarry Smith 
699f1af5d2fSBarry Smith   PetscFunctionBegin;
700f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702f1af5d2fSBarry Smith   t  = a->solve_work;
703f1af5d2fSBarry Smith 
704f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706f1af5d2fSBarry Smith 
707f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
708f1af5d2fSBarry Smith   ii = 0;
709f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
710f1af5d2fSBarry Smith     ic      = 4*c[i];
711f1af5d2fSBarry Smith     t[ii]   = b[ic];
712f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
713f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
714f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
715f1af5d2fSBarry Smith     ii += 4;
716f1af5d2fSBarry Smith   }
717f1af5d2fSBarry Smith 
718f1af5d2fSBarry Smith   /* forward solve the U^T */
719f1af5d2fSBarry Smith   idx = 0;
720f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
723f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
724f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729f1af5d2fSBarry Smith     v += 16;
730f1af5d2fSBarry Smith 
731f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
732f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
733f1af5d2fSBarry Smith     while (nz--) {
734f1af5d2fSBarry Smith       oidx = 4*(*vi++);
735f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739f1af5d2fSBarry Smith       v  += 16;
740f1af5d2fSBarry Smith     }
741f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742f1af5d2fSBarry Smith     idx += 4;
743f1af5d2fSBarry Smith   }
744f1af5d2fSBarry Smith   /* backward solve the L^T */
745f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
746f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
747f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
748f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
749f1af5d2fSBarry Smith     idt  = 4*i;
750f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       idx   = 4*(*vi--);
753f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757f1af5d2fSBarry Smith       v -= 16;
758f1af5d2fSBarry Smith     }
759f1af5d2fSBarry Smith   }
760f1af5d2fSBarry Smith 
761f1af5d2fSBarry Smith   /* copy t into x according to permutation */
762f1af5d2fSBarry Smith   ii = 0;
763f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
764f1af5d2fSBarry Smith     ir      = 4*r[i];
765f1af5d2fSBarry Smith     x[ir]   = t[ii];
766f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
767f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
768f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
769f1af5d2fSBarry Smith     ii += 4;
770f1af5d2fSBarry Smith   }
771f1af5d2fSBarry Smith 
772f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777f1af5d2fSBarry Smith   PetscFunctionReturn(0);
778f1af5d2fSBarry Smith }
779f1af5d2fSBarry Smith 
7804a2ae208SSatish Balay #undef __FUNCT__
7814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
7827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783f1af5d2fSBarry Smith {
784f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
786f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
788f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
78987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
79087828ca2SBarry Smith   PetscScalar     *x,*b,*t;
791f1af5d2fSBarry Smith 
792f1af5d2fSBarry Smith   PetscFunctionBegin;
793f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795f1af5d2fSBarry Smith   t  = a->solve_work;
796f1af5d2fSBarry Smith 
797f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799f1af5d2fSBarry Smith 
800f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
801f1af5d2fSBarry Smith   ii = 0;
802f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
803f1af5d2fSBarry Smith     ic      = 5*c[i];
804f1af5d2fSBarry Smith     t[ii]   = b[ic];
805f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
806f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
807f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
808f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
809f1af5d2fSBarry Smith     ii += 5;
810f1af5d2fSBarry Smith   }
811f1af5d2fSBarry Smith 
812f1af5d2fSBarry Smith   /* forward solve the U^T */
813f1af5d2fSBarry Smith   idx = 0;
814f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
817f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
818f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824f1af5d2fSBarry Smith     v += 25;
825f1af5d2fSBarry Smith 
826f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
827f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
828f1af5d2fSBarry Smith     while (nz--) {
829f1af5d2fSBarry Smith       oidx = 5*(*vi++);
830f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835f1af5d2fSBarry Smith       v  += 25;
836f1af5d2fSBarry Smith     }
837f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838f1af5d2fSBarry Smith     idx += 5;
839f1af5d2fSBarry Smith   }
840f1af5d2fSBarry Smith   /* backward solve the L^T */
841f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
842f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
843f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
844f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
845f1af5d2fSBarry Smith     idt  = 5*i;
846f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       idx   = 5*(*vi--);
849f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v -= 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith 
858f1af5d2fSBarry Smith   /* copy t into x according to permutation */
859f1af5d2fSBarry Smith   ii = 0;
860f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
861f1af5d2fSBarry Smith     ir      = 5*r[i];
862f1af5d2fSBarry Smith     x[ir]   = t[ii];
863f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
864f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
865f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
866f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
867f1af5d2fSBarry Smith     ii += 5;
868f1af5d2fSBarry Smith   }
869f1af5d2fSBarry Smith 
870f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875f1af5d2fSBarry Smith   PetscFunctionReturn(0);
876f1af5d2fSBarry Smith }
877f1af5d2fSBarry Smith 
8784a2ae208SSatish Balay #undef __FUNCT__
8794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
8807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881f1af5d2fSBarry Smith {
882f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
884f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
886f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
88787828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
88887828ca2SBarry Smith   PetscScalar     *x,*b,*t;
889f1af5d2fSBarry Smith 
890f1af5d2fSBarry Smith   PetscFunctionBegin;
891f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893f1af5d2fSBarry Smith   t  = a->solve_work;
894f1af5d2fSBarry Smith 
895f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897f1af5d2fSBarry Smith 
898f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
899f1af5d2fSBarry Smith   ii = 0;
900f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
901f1af5d2fSBarry Smith     ic      = 6*c[i];
902f1af5d2fSBarry Smith     t[ii]   = b[ic];
903f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
904f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
905f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
906f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
907f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
908f1af5d2fSBarry Smith     ii += 6;
909f1af5d2fSBarry Smith   }
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   /* forward solve the U^T */
912f1af5d2fSBarry Smith   idx = 0;
913f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
914f1af5d2fSBarry Smith 
915f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
916f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
917f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918f1af5d2fSBarry Smith     x6    = t[5+idx];
919f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925f1af5d2fSBarry Smith     v += 36;
926f1af5d2fSBarry Smith 
927f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
928f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
929f1af5d2fSBarry Smith     while (nz--) {
930f1af5d2fSBarry Smith       oidx = 6*(*vi++);
931f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937f1af5d2fSBarry Smith       v  += 36;
938f1af5d2fSBarry Smith     }
939f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940f1af5d2fSBarry Smith     t[5+idx] = s6;
941f1af5d2fSBarry Smith     idx += 6;
942f1af5d2fSBarry Smith   }
943f1af5d2fSBarry Smith   /* backward solve the L^T */
944f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
945f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
946f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
947f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
948f1af5d2fSBarry Smith     idt  = 6*i;
949f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950f1af5d2fSBarry Smith     s6 = t[5+idt];
951f1af5d2fSBarry Smith     while (nz--) {
952f1af5d2fSBarry Smith       idx   = 6*(*vi--);
953f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959f1af5d2fSBarry Smith       v -= 36;
960f1af5d2fSBarry Smith     }
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith 
963f1af5d2fSBarry Smith   /* copy t into x according to permutation */
964f1af5d2fSBarry Smith   ii = 0;
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith     ir      = 6*r[i];
967f1af5d2fSBarry Smith     x[ir]   = t[ii];
968f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
969f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
970f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
971f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
972f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
973f1af5d2fSBarry Smith     ii += 6;
974f1af5d2fSBarry Smith   }
975f1af5d2fSBarry Smith 
976f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981f1af5d2fSBarry Smith   PetscFunctionReturn(0);
982f1af5d2fSBarry Smith }
983f1af5d2fSBarry Smith 
9844a2ae208SSatish Balay #undef __FUNCT__
9854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
9867c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987f1af5d2fSBarry Smith {
988f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
990f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
992f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
99387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
99487828ca2SBarry Smith   PetscScalar     *x,*b,*t;
995f1af5d2fSBarry Smith 
996f1af5d2fSBarry Smith   PetscFunctionBegin;
997f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999f1af5d2fSBarry Smith   t  = a->solve_work;
1000f1af5d2fSBarry Smith 
1001f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003f1af5d2fSBarry Smith 
1004f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1005f1af5d2fSBarry Smith   ii = 0;
1006f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1007f1af5d2fSBarry Smith     ic      = 7*c[i];
1008f1af5d2fSBarry Smith     t[ii]   = b[ic];
1009f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1010f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1011f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1012f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1013f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1014f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1015f1af5d2fSBarry Smith     ii += 7;
1016f1af5d2fSBarry Smith   }
1017f1af5d2fSBarry Smith 
1018f1af5d2fSBarry Smith   /* forward solve the U^T */
1019f1af5d2fSBarry Smith   idx = 0;
1020f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1023f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1024f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1026f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033f1af5d2fSBarry Smith     v += 49;
1034f1af5d2fSBarry Smith 
1035f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1036f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1037f1af5d2fSBarry Smith     while (nz--) {
1038f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1039f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046f1af5d2fSBarry Smith       v  += 49;
1047f1af5d2fSBarry Smith     }
1048f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1050f1af5d2fSBarry Smith     idx += 7;
1051f1af5d2fSBarry Smith   }
1052f1af5d2fSBarry Smith   /* backward solve the L^T */
1053f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1054f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1055f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1056f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1057f1af5d2fSBarry Smith     idt  = 7*i;
1058f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1062f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v -= 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith   }
1072f1af5d2fSBarry Smith 
1073f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1074f1af5d2fSBarry Smith   ii = 0;
1075f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1076f1af5d2fSBarry Smith     ir      = 7*r[i];
1077f1af5d2fSBarry Smith     x[ir]   = t[ii];
1078f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1079f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1080f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1081f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1082f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1083f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1084f1af5d2fSBarry Smith     ii += 7;
1085f1af5d2fSBarry Smith   }
1086f1af5d2fSBarry Smith 
1087f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1093f1af5d2fSBarry Smith }
1094f1af5d2fSBarry Smith 
10954e2b4712SSatish Balay /* ----------------------------------------------------------- */
10964a2ae208SSatish Balay #undef __FUNCT__
10974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
10984e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
10994e2b4712SSatish Balay {
11004e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11014e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11024e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
11034e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
11043f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
110587828ca2SBarry Smith   PetscScalar     *x,*b,*s,*t,*ls;
11064e2b4712SSatish Balay 
11074e2b4712SSatish Balay   PetscFunctionBegin;
1108e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110f1af5d2fSBarry Smith   t  = a->solve_work;
11114e2b4712SSatish Balay 
11124e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11134e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11144e2b4712SSatish Balay 
11154e2b4712SSatish Balay   /* forward solve the lower triangular */
111687828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11174e2b4712SSatish Balay   for (i=1; i<n; i++) {
11184e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11194e2b4712SSatish Balay     vi  = aj + ai[i];
11204e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1121f1af5d2fSBarry Smith     s = t + bs*i;
112287828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11234e2b4712SSatish Balay     while (nz--) {
1124f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11254e2b4712SSatish Balay       v += bs2;
11264e2b4712SSatish Balay     }
11274e2b4712SSatish Balay   }
11284e2b4712SSatish Balay   /* backward solve the upper triangular */
1129273d9f13SBarry Smith   ls = a->solve_work + A->n;
11304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11314e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11324e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11334e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
113487828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11354e2b4712SSatish Balay     while (nz--) {
1136f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11374e2b4712SSatish Balay       v += bs2;
11384e2b4712SSatish Balay     }
1139f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
114087828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11414e2b4712SSatish Balay   }
11424e2b4712SSatish Balay 
11434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11484e2b4712SSatish Balay   PetscFunctionReturn(0);
11494e2b4712SSatish Balay }
11504e2b4712SSatish Balay 
11514a2ae208SSatish Balay #undef __FUNCT__
11524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
11534e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11544e2b4712SSatish Balay {
11554e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11564e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11574e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11584e2b4712SSatish Balay   int             *diag = a->diag;
11593f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
116087828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
116187828ca2SBarry Smith   PetscScalar     *x,*b,*t;
11624e2b4712SSatish Balay 
11634e2b4712SSatish Balay   PetscFunctionBegin;
1164e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166f1af5d2fSBarry Smith   t  = a->solve_work;
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11704e2b4712SSatish Balay 
11714e2b4712SSatish Balay   /* forward solve the lower triangular */
11724e2b4712SSatish Balay   idx    = 7*(*r++);
1173f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1174f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11764e2b4712SSatish Balay 
11774e2b4712SSatish Balay   for (i=1; i<n; i++) {
11784e2b4712SSatish Balay     v     = aa + 49*ai[i];
11794e2b4712SSatish Balay     vi    = aj + ai[i];
11804e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11814e2b4712SSatish Balay     idx   = 7*(*r++);
1182f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11844e2b4712SSatish Balay     while (nz--) {
11854e2b4712SSatish Balay       idx   = 7*(*vi++);
1186f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1188f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1189f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11964e2b4712SSatish Balay       v += 49;
11974e2b4712SSatish Balay     }
11984e2b4712SSatish Balay     idx = 7*i;
1199f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1200f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12024e2b4712SSatish Balay   }
12034e2b4712SSatish Balay   /* backward solve the upper triangular */
12044e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12054e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12064e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12074e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12084e2b4712SSatish Balay     idt  = 7*i;
1209f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1210f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12124e2b4712SSatish Balay     while (nz--) {
12134e2b4712SSatish Balay       idx   = 7*(*vi++);
1214f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1215f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1217f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12244e2b4712SSatish Balay       v += 49;
12254e2b4712SSatish Balay     }
12264e2b4712SSatish Balay     idc = 7*(*c--);
12274e2b4712SSatish Balay     v   = aa + 49*diag[i];
1228f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12424e2b4712SSatish Balay   }
12434e2b4712SSatish Balay 
12444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12494e2b4712SSatish Balay   PetscFunctionReturn(0);
12504e2b4712SSatish Balay }
12514e2b4712SSatish Balay 
12524a2ae208SSatish Balay #undef __FUNCT__
12534a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
125415091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
125515091d37SBarry Smith {
125615091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
125715091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
125815091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
125915091d37SBarry Smith   MatScalar       *aa=a->a,*v;
126087828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
126115091d37SBarry Smith 
126215091d37SBarry Smith   PetscFunctionBegin;
126315091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
126415091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126515091d37SBarry Smith   /* forward solve the lower triangular */
126615091d37SBarry Smith   idx    = 0;
126715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
126815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
126915091d37SBarry Smith   x[6] = b[6+idx];
127015091d37SBarry Smith   for (i=1; i<n; i++) {
127115091d37SBarry Smith     v     =  aa + 49*ai[i];
127215091d37SBarry Smith     vi    =  aj + ai[i];
127315091d37SBarry Smith     nz    =  diag[i] - ai[i];
127415091d37SBarry Smith     idx   =  7*i;
1275f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277f1af5d2fSBarry Smith     s7  =  b[6+idx];
127815091d37SBarry Smith     while (nz--) {
127915091d37SBarry Smith       jdx   = 7*(*vi++);
128015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
128115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
128215091d37SBarry Smith       x7    = x[6+jdx];
1283f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
129015091d37SBarry Smith       v += 49;
129115091d37SBarry Smith      }
1292f1af5d2fSBarry Smith     x[idx]   = s1;
1293f1af5d2fSBarry Smith     x[1+idx] = s2;
1294f1af5d2fSBarry Smith     x[2+idx] = s3;
1295f1af5d2fSBarry Smith     x[3+idx] = s4;
1296f1af5d2fSBarry Smith     x[4+idx] = s5;
1297f1af5d2fSBarry Smith     x[5+idx] = s6;
1298f1af5d2fSBarry Smith     x[6+idx] = s7;
129915091d37SBarry Smith   }
130015091d37SBarry Smith   /* backward solve the upper triangular */
130115091d37SBarry Smith   for (i=n-1; i>=0; i--){
130215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
130315091d37SBarry Smith     vi   = aj + diag[i] + 1;
130415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
130515091d37SBarry Smith     idt  = 7*i;
1306f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1307f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1308f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1309f1af5d2fSBarry Smith     s7 = x[6+idt];
131015091d37SBarry Smith     while (nz--) {
131115091d37SBarry Smith       idx   = 7*(*vi++);
131215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
131315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
131415091d37SBarry Smith       x7    = x[6+idx];
1315f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
132215091d37SBarry Smith       v += 49;
132315091d37SBarry Smith     }
132415091d37SBarry Smith     v        = aa + 49*diag[i];
1325f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
133915091d37SBarry Smith   }
134015091d37SBarry Smith 
134115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
134215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
134415091d37SBarry Smith   PetscFunctionReturn(0);
134515091d37SBarry Smith }
134615091d37SBarry Smith 
13474a2ae208SSatish Balay #undef __FUNCT__
13484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
134915091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
135015091d37SBarry Smith {
135115091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
135215091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
135315091d37SBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
135415091d37SBarry Smith   int             *diag = a->diag;
135515091d37SBarry Smith   MatScalar       *aa=a->a,*v;
135687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
135715091d37SBarry Smith 
135815091d37SBarry Smith   PetscFunctionBegin;
135915091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
136015091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361f1af5d2fSBarry Smith   t  = a->solve_work;
136215091d37SBarry Smith 
136315091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
136415091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
136515091d37SBarry Smith 
136615091d37SBarry Smith   /* forward solve the lower triangular */
136715091d37SBarry Smith   idx    = 6*(*r++);
1368f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1369f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1370f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
137115091d37SBarry Smith   for (i=1; i<n; i++) {
137215091d37SBarry Smith     v     = aa + 36*ai[i];
137315091d37SBarry Smith     vi    = aj + ai[i];
137415091d37SBarry Smith     nz    = diag[i] - ai[i];
137515091d37SBarry Smith     idx   = 6*(*r++);
1376f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
137815091d37SBarry Smith     while (nz--) {
137915091d37SBarry Smith       idx   = 6*(*vi++);
1380f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
138815091d37SBarry Smith       v += 36;
138915091d37SBarry Smith     }
139015091d37SBarry Smith     idx = 6*i;
1391f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1392f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1393f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
139415091d37SBarry Smith   }
139515091d37SBarry Smith   /* backward solve the upper triangular */
139615091d37SBarry Smith   for (i=n-1; i>=0; i--){
139715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
139815091d37SBarry Smith     vi   = aj + diag[i] + 1;
139915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
140015091d37SBarry Smith     idt  = 6*i;
1401f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1402f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1403f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
140415091d37SBarry Smith     while (nz--) {
140515091d37SBarry Smith       idx   = 6*(*vi++);
1406f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1407f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1408f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1409f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141515091d37SBarry Smith       v += 36;
141615091d37SBarry Smith     }
141715091d37SBarry Smith     idc = 6*(*c--);
141815091d37SBarry Smith     v   = aa + 36*diag[i];
1419f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
143115091d37SBarry Smith   }
143215091d37SBarry Smith 
143315091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
143415091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
143515091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
143615091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
143815091d37SBarry Smith   PetscFunctionReturn(0);
143915091d37SBarry Smith }
144015091d37SBarry Smith 
14414a2ae208SSatish Balay #undef __FUNCT__
14424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
144315091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
144415091d37SBarry Smith {
144515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
144615091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
144715091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
144815091d37SBarry Smith   MatScalar       *aa=a->a,*v;
144987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
145015091d37SBarry Smith 
145115091d37SBarry Smith   PetscFunctionBegin;
145215091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
145315091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
145415091d37SBarry Smith   /* forward solve the lower triangular */
145515091d37SBarry Smith   idx    = 0;
145615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
145715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
145815091d37SBarry Smith   for (i=1; i<n; i++) {
145915091d37SBarry Smith     v     =  aa + 36*ai[i];
146015091d37SBarry Smith     vi    =  aj + ai[i];
146115091d37SBarry Smith     nz    =  diag[i] - ai[i];
146215091d37SBarry Smith     idx   =  6*i;
1463f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
146515091d37SBarry Smith     while (nz--) {
146615091d37SBarry Smith       jdx   = 6*(*vi++);
146715091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
146815091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
147515091d37SBarry Smith       v += 36;
147615091d37SBarry Smith      }
1477f1af5d2fSBarry Smith     x[idx]   = s1;
1478f1af5d2fSBarry Smith     x[1+idx] = s2;
1479f1af5d2fSBarry Smith     x[2+idx] = s3;
1480f1af5d2fSBarry Smith     x[3+idx] = s4;
1481f1af5d2fSBarry Smith     x[4+idx] = s5;
1482f1af5d2fSBarry Smith     x[5+idx] = s6;
148315091d37SBarry Smith   }
148415091d37SBarry Smith   /* backward solve the upper triangular */
148515091d37SBarry Smith   for (i=n-1; i>=0; i--){
148615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148715091d37SBarry Smith     vi   = aj + diag[i] + 1;
148815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148915091d37SBarry Smith     idt  = 6*i;
1490f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1491f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1492f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
149315091d37SBarry Smith     while (nz--) {
149415091d37SBarry Smith       idx   = 6*(*vi++);
149515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
149615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150315091d37SBarry Smith       v += 36;
150415091d37SBarry Smith     }
150515091d37SBarry Smith     v        = aa + 36*diag[i];
1506f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
151215091d37SBarry Smith   }
151315091d37SBarry Smith 
151415091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
151515091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
151715091d37SBarry Smith   PetscFunctionReturn(0);
151815091d37SBarry Smith }
151915091d37SBarry Smith 
15204a2ae208SSatish Balay #undef __FUNCT__
15214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
15224e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15234e2b4712SSatish Balay {
15244e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15254e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
15264e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15274e2b4712SSatish Balay   int             *diag = a->diag;
15283f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
152987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15304e2b4712SSatish Balay 
15314e2b4712SSatish Balay   PetscFunctionBegin;
1532e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534f1af5d2fSBarry Smith   t  = a->solve_work;
15354e2b4712SSatish Balay 
15364e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15374e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15384e2b4712SSatish Balay 
15394e2b4712SSatish Balay   /* forward solve the lower triangular */
15404e2b4712SSatish Balay   idx    = 5*(*r++);
1541f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1542f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15434e2b4712SSatish Balay   for (i=1; i<n; i++) {
15444e2b4712SSatish Balay     v     = aa + 25*ai[i];
15454e2b4712SSatish Balay     vi    = aj + ai[i];
15464e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15474e2b4712SSatish Balay     idx   = 5*(*r++);
1548f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549f1af5d2fSBarry Smith     s5  = b[4+idx];
15504e2b4712SSatish Balay     while (nz--) {
15514e2b4712SSatish Balay       idx   = 5*(*vi++);
1552f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1554f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15594e2b4712SSatish Balay       v += 25;
15604e2b4712SSatish Balay     }
15614e2b4712SSatish Balay     idx = 5*i;
1562f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1563f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15644e2b4712SSatish Balay   }
15654e2b4712SSatish Balay   /* backward solve the upper triangular */
15664e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15674e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15684e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15694e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15704e2b4712SSatish Balay     idt  = 5*i;
1571f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1572f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15734e2b4712SSatish Balay     while (nz--) {
15744e2b4712SSatish Balay       idx   = 5*(*vi++);
1575f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1576f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15824e2b4712SSatish Balay       v += 25;
15834e2b4712SSatish Balay     }
15844e2b4712SSatish Balay     idc = 5*(*c--);
15854e2b4712SSatish Balay     v   = aa + 25*diag[i];
1586f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1588f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1590f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1592f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1594f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
15964e2b4712SSatish Balay   }
15974e2b4712SSatish Balay 
15984e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15994e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
16034e2b4712SSatish Balay   PetscFunctionReturn(0);
16044e2b4712SSatish Balay }
16054e2b4712SSatish Balay 
16064a2ae208SSatish Balay #undef __FUNCT__
16074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
160815091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
160915091d37SBarry Smith {
161015091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
161115091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
161215091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
161315091d37SBarry Smith   MatScalar       *aa=a->a,*v;
161487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
161515091d37SBarry Smith 
161615091d37SBarry Smith   PetscFunctionBegin;
161715091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
161815091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
161915091d37SBarry Smith   /* forward solve the lower triangular */
162015091d37SBarry Smith   idx    = 0;
162115091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
162215091d37SBarry Smith   for (i=1; i<n; i++) {
162315091d37SBarry Smith     v     =  aa + 25*ai[i];
162415091d37SBarry Smith     vi    =  aj + ai[i];
162515091d37SBarry Smith     nz    =  diag[i] - ai[i];
162615091d37SBarry Smith     idx   =  5*i;
1627f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
162815091d37SBarry Smith     while (nz--) {
162915091d37SBarry Smith       jdx   = 5*(*vi++);
163015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
163615091d37SBarry Smith       v    += 25;
163715091d37SBarry Smith     }
1638f1af5d2fSBarry Smith     x[idx]   = s1;
1639f1af5d2fSBarry Smith     x[1+idx] = s2;
1640f1af5d2fSBarry Smith     x[2+idx] = s3;
1641f1af5d2fSBarry Smith     x[3+idx] = s4;
1642f1af5d2fSBarry Smith     x[4+idx] = s5;
164315091d37SBarry Smith   }
164415091d37SBarry Smith   /* backward solve the upper triangular */
164515091d37SBarry Smith   for (i=n-1; i>=0; i--){
164615091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
164715091d37SBarry Smith     vi   = aj + diag[i] + 1;
164815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
164915091d37SBarry Smith     idt  = 5*i;
1650f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1651f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
165215091d37SBarry Smith     while (nz--) {
165315091d37SBarry Smith       idx   = 5*(*vi++);
165415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166015091d37SBarry Smith       v    += 25;
166115091d37SBarry Smith     }
166215091d37SBarry Smith     v        = aa + 25*diag[i];
1663f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
166815091d37SBarry Smith   }
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167115091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
167315091d37SBarry Smith   PetscFunctionReturn(0);
167415091d37SBarry Smith }
167515091d37SBarry Smith 
16764a2ae208SSatish Balay #undef __FUNCT__
16774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
16784e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16794e2b4712SSatish Balay {
16804e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
16814e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
16824e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
16834e2b4712SSatish Balay   int             *diag = a->diag;
16843f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
168587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
16864e2b4712SSatish Balay 
16874e2b4712SSatish Balay   PetscFunctionBegin;
1688e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690f1af5d2fSBarry Smith   t  = a->solve_work;
16914e2b4712SSatish Balay 
16924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16944e2b4712SSatish Balay 
16954e2b4712SSatish Balay   /* forward solve the lower triangular */
16964e2b4712SSatish Balay   idx    = 4*(*r++);
1697f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1698f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
16994e2b4712SSatish Balay   for (i=1; i<n; i++) {
17004e2b4712SSatish Balay     v     = aa + 16*ai[i];
17014e2b4712SSatish Balay     vi    = aj + ai[i];
17024e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17034e2b4712SSatish Balay     idx   = 4*(*r++);
1704f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17054e2b4712SSatish Balay     while (nz--) {
17064e2b4712SSatish Balay       idx   = 4*(*vi++);
1707f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17124e2b4712SSatish Balay       v    += 16;
17134e2b4712SSatish Balay     }
17144e2b4712SSatish Balay     idx        = 4*i;
1715f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1716f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17174e2b4712SSatish Balay   }
17184e2b4712SSatish Balay   /* backward solve the upper triangular */
17194e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17204e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17214e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17224e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17234e2b4712SSatish Balay     idt  = 4*i;
1724f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1725f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17264e2b4712SSatish Balay     while (nz--) {
17274e2b4712SSatish Balay       idx   = 4*(*vi++);
1728f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1729f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1730f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17344e2b4712SSatish Balay       v += 16;
17354e2b4712SSatish Balay     }
17364e2b4712SSatish Balay     idc      = 4*(*c--);
17374e2b4712SSatish Balay     v        = aa + 16*diag[i];
1738f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17424e2b4712SSatish Balay   }
17434e2b4712SSatish Balay 
17444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17494e2b4712SSatish Balay   PetscFunctionReturn(0);
17504e2b4712SSatish Balay }
1751f26ec98cSKris Buschelman 
1752f26ec98cSKris Buschelman #undef __FUNCT__
1753f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1754f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1755f26ec98cSKris Buschelman {
1756f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
1757f26ec98cSKris Buschelman   IS              iscol=a->col,isrow=a->row;
1758f26ec98cSKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
1759f26ec98cSKris Buschelman   int             *diag = a->diag;
1760f26ec98cSKris Buschelman   MatScalar       *aa=a->a,*v,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1761f26ec98cSKris Buschelman   PetscScalar     *x,*b;
1762f26ec98cSKris Buschelman 
1763f26ec98cSKris Buschelman   PetscFunctionBegin;
1764f26ec98cSKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1765f26ec98cSKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1766f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1767f26ec98cSKris Buschelman 
1768f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1769f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1770f26ec98cSKris Buschelman 
1771f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1772f26ec98cSKris Buschelman   idx    = 4*(*r++);
1773f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1774f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1775f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1776f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1777f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1778f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1779f26ec98cSKris Buschelman     vi    = aj + ai[i];
1780f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1781f26ec98cSKris Buschelman     idx   = 4*(*r++);
1782f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1783f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1784f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1785f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1786f26ec98cSKris Buschelman     while (nz--) {
1787f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1788f26ec98cSKris Buschelman       x1  = t[idx];
1789f26ec98cSKris Buschelman       x2  = t[1+idx];
1790f26ec98cSKris Buschelman       x3  = t[2+idx];
1791f26ec98cSKris Buschelman       x4  = t[3+idx];
1792f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1793f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1794f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1795f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1796f26ec98cSKris Buschelman       v    += 16;
1797f26ec98cSKris Buschelman     }
1798f26ec98cSKris Buschelman     idx        = 4*i;
1799f26ec98cSKris Buschelman     t[idx]   = s1;
1800f26ec98cSKris Buschelman     t[1+idx] = s2;
1801f26ec98cSKris Buschelman     t[2+idx] = s3;
1802f26ec98cSKris Buschelman     t[3+idx] = s4;
1803f26ec98cSKris Buschelman   }
1804f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1805f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1806f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1807f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1808f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1809f26ec98cSKris Buschelman     idt  = 4*i;
1810f26ec98cSKris Buschelman     s1 = t[idt];
1811f26ec98cSKris Buschelman     s2 = t[1+idt];
1812f26ec98cSKris Buschelman     s3 = t[2+idt];
1813f26ec98cSKris Buschelman     s4 = t[3+idt];
1814f26ec98cSKris Buschelman     while (nz--) {
1815f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1816f26ec98cSKris Buschelman       x1  = t[idx];
1817f26ec98cSKris Buschelman       x2  = t[1+idx];
1818f26ec98cSKris Buschelman       x3  = t[2+idx];
1819f26ec98cSKris Buschelman       x4  = t[3+idx];
1820f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1821f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1822f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1823f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1824f26ec98cSKris Buschelman       v += 16;
1825f26ec98cSKris Buschelman     }
1826f26ec98cSKris Buschelman     idc      = 4*(*c--);
1827f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1828f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1829f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1830f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1831f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1832f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1833f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1834f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1835f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1836f26ec98cSKris Buschelman  }
1837f26ec98cSKris Buschelman 
1838f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1839f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1840f26ec98cSKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1841f26ec98cSKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1842f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
1843f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1844f26ec98cSKris Buschelman }
1845f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - forward and backward block sweeps over a
   SeqBAIJ factor with 4x4 blocks, using the row and column index sets stored
   in the matrix, with the 4x4 block products done through the SSE macros on
   4-float operands.

   Input:
     A  - matrix whose data pointer is a Mat_SeqBAIJ; its mbs, i, j, a, diag,
          row, col and solve_work members are read
     bb - right-hand side vector
   Output:
     xx - solution vector

   The right-hand side block for factor row k is taken from block r[k] of bb.
   Both sweeps keep their per-row results in the work array t (solve_work);
   xx is written only in the backward sweep, where the finished block of
   factor row k is streamed to block c[k] of xx.

   Operands are converted to float in the 16-byte-aligned scratch slots tmps
   and tmpx before each SSE sequence, and results are converted back to
   PetscScalar afterwards.  The final copy into xx moves each PetscScalar
   pair as four floats, so it relies on PetscScalar being a double.

   NOTE(review): the macros that take &t[...] or &x[...] as a float pointer
   presumably require those addresses to be 16-byte aligned -- confirm
   against the SSE macro header.

   Completes with PetscFunctionReturn(0); the results of the Vec and IS
   calls are passed to CHKERRQ().
*/
int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
  IS              iscol=a->col,isrow=a->row;
  int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
  int             *diag = a->diag,ai16;
  MatScalar       *aa=a->a,*v;
  PetscScalar     *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /*
       Pick the first 16-byte boundary inside ssealignedspace: offset is the
       number of floats to skip (0..3).  tmps and tmpx are the two 4-float
       slots that follow, so at most index 10 of the 11 entries is used.
    */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    /* i is advanced inside the body, together with the block pointer v */
    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /*
           Demote solution (so far) from double to float.  During the
           forward sweep the already-solved rows are held in the work
           array t; x receives no data until the backward sweep.
        */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    /* i is decremented inside the body, together with ai16 */
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /*
         Step to the previous block row.  Once i drops below 0 there is no
         such row, so ai16 keeps its current value instead of reading
         diag[-1]; it then only feeds the prefetches below.
      */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[2+idc] = t[2+idt]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    PetscLogFlops(2*16*(a->nz) - 4*A->n);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
20670ef38995SBarry Smith 
20680ef38995SBarry Smith 
20694e2b4712SSatish Balay /*
20704e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
20714e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
20724e2b4712SSatish Balay */
20734a2ae208SSatish Balay #undef __FUNCT__
20744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
20754e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
20764e2b4712SSatish Balay {
20774e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
207830d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
207930d4dcafSBarry Smith   int             ierr,*diag = a->diag;
20803f1db9ecSBarry Smith   MatScalar       *aa=a->a;
208187828ca2SBarry Smith   PetscScalar     *x,*b;
20824e2b4712SSatish Balay 
20834e2b4712SSatish Balay   PetscFunctionBegin;
2084e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2085e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20864e2b4712SSatish Balay 
2087aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
20882853dc0eSBarry Smith   {
208987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20902853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
20912853dc0eSBarry Smith   }
2092aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
20932853dc0eSBarry Smith   {
209487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20952853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
20962853dc0eSBarry Smith   }
2097aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
20982853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2099e1293385SBarry Smith #else
210030d4dcafSBarry Smith   {
210187828ca2SBarry Smith     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
21023f1db9ecSBarry Smith     MatScalar    *v;
21034e555682SBarry Smith     int          jdx,idt,idx,nz,*vi,i,ai16;
2104e1293385SBarry Smith 
21054e2b4712SSatish Balay   /* forward solve the lower triangular */
21064e2b4712SSatish Balay   idx    = 0;
2107e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21084e2b4712SSatish Balay   for (i=1; i<n; i++) {
21094e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21104e2b4712SSatish Balay     vi    =  aj      + ai[i];
21114e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2112e1293385SBarry Smith     idx   +=  4;
2113f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21144e2b4712SSatish Balay     while (nz--) {
21154e2b4712SSatish Balay       jdx   = 4*(*vi++);
21164e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2117f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2118f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2119f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2120f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21214e2b4712SSatish Balay       v    += 16;
21224e2b4712SSatish Balay     }
2123f1af5d2fSBarry Smith     x[idx]   = s1;
2124f1af5d2fSBarry Smith     x[1+idx] = s2;
2125f1af5d2fSBarry Smith     x[2+idx] = s3;
2126f1af5d2fSBarry Smith     x[3+idx] = s4;
21274e2b4712SSatish Balay   }
21284e2b4712SSatish Balay   /* backward solve the upper triangular */
21294e555682SBarry Smith   idt = 4*(n-1);
21304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21314e555682SBarry Smith     ai16 = 16*diag[i];
21324e555682SBarry Smith     v    = aa + ai16 + 16;
21334e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21344e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2135f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2136f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21374e2b4712SSatish Balay     while (nz--) {
21384e2b4712SSatish Balay       idx   = 4*(*vi++);
21394e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2140f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2141f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2142f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2143f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21444e2b4712SSatish Balay       v    += 16;
21454e2b4712SSatish Balay     }
21464e555682SBarry Smith     v        = aa + ai16;
2147f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2148f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2149f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2150f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2151329f5518SBarry Smith     idt -= 4;
21524e2b4712SSatish Balay   }
215330d4dcafSBarry Smith   }
2154e1293385SBarry Smith #endif
21554e2b4712SSatish Balay 
2156e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2157e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2158b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
21594e2b4712SSatish Balay   PetscFunctionReturn(0);
21604e2b4712SSatish Balay }
21614e2b4712SSatish Balay 
2162f26ec98cSKris Buschelman #undef __FUNCT__
2163f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2164f26ec98cSKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2165f26ec98cSKris Buschelman {
2166f26ec98cSKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
2167f26ec98cSKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
2168f26ec98cSKris Buschelman   int             ierr,*diag = a->diag;
2169f26ec98cSKris Buschelman   MatScalar       *aa=a->a;
2170f26ec98cSKris Buschelman   PetscScalar     *x,*b;
2171f26ec98cSKris Buschelman 
2172f26ec98cSKris Buschelman   PetscFunctionBegin;
2173f26ec98cSKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2174f26ec98cSKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2175f26ec98cSKris Buschelman 
2176f26ec98cSKris Buschelman   {
2177f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2178f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2179f26ec98cSKris Buschelman     int        jdx,idt,idx,nz,*vi,i,ai16;
2180f26ec98cSKris Buschelman 
2181f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2182f26ec98cSKris Buschelman     idx  = 0;
2183f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2184f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2185f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2186f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2187f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2188f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2189f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2190f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2191f26ec98cSKris Buschelman       idx   +=  4;
2192f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2193f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2194f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2195f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2196f26ec98cSKris Buschelman       while (nz--) {
2197f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2198f26ec98cSKris Buschelman         x1  = t[jdx];
2199f26ec98cSKris Buschelman         x2  = t[1+jdx];
2200f26ec98cSKris Buschelman         x3  = t[2+jdx];
2201f26ec98cSKris Buschelman         x4  = t[3+jdx];
2202f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2203f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2204f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2205f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2206f26ec98cSKris Buschelman         v    += 16;
2207f26ec98cSKris Buschelman       }
2208f26ec98cSKris Buschelman       t[idx]   = s1;
2209f26ec98cSKris Buschelman       t[1+idx] = s2;
2210f26ec98cSKris Buschelman       t[2+idx] = s3;
2211f26ec98cSKris Buschelman       t[3+idx] = s4;
2212f26ec98cSKris Buschelman     }
2213f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2214f26ec98cSKris Buschelman     idt = 4*(n-1);
2215f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2216f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2217f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2218f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2219f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2220f26ec98cSKris Buschelman       s1   = t[idt];
2221f26ec98cSKris Buschelman       s2   = t[1+idt];
2222f26ec98cSKris Buschelman       s3   = t[2+idt];
2223f26ec98cSKris Buschelman       s4   = t[3+idt];
2224f26ec98cSKris Buschelman       while (nz--) {
2225f26ec98cSKris Buschelman         idx = 4*(*vi++);
2226f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2227f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2228f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2229f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2230f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2234f26ec98cSKris Buschelman         v    += 16;
2235f26ec98cSKris Buschelman       }
2236f26ec98cSKris Buschelman       v        = aa + ai16;
2237f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2238f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2239f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2240f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2241f26ec98cSKris Buschelman       idt -= 4;
2242f26ec98cSKris Buschelman     }
2243f26ec98cSKris Buschelman   }
2244f26ec98cSKris Buschelman 
2245f26ec98cSKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2246f26ec98cSKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2247f26ec98cSKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
2248f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2249f26ec98cSKris Buschelman }
2250f26ec98cSKris Buschelman 
22513660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22523660e330SKris Buschelman 
22533660e330SKris Buschelman #include PETSC_HAVE_SSE
22546f6a888dSBarry Smith #include "src/vec/vecimpl.h" /* to allow VecGetArrayFast() */
22553660e330SKris Buschelman #undef __FUNCT__
22567cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
22577cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
22583660e330SKris Buschelman {
22593660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
22602aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
22612aa5897fSKris Buschelman   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
22623660e330SKris Buschelman   MatScalar      *aa=a->a;
226387828ca2SBarry Smith   PetscScalar    *x,*b;
22643660e330SKris Buschelman 
22653660e330SKris Buschelman   PetscFunctionBegin;
22663660e330SKris Buschelman   SSE_SCOPE_BEGIN;
22673660e330SKris Buschelman   /*
22683660e330SKris Buschelman      Note: This code currently uses demotion of double
22693660e330SKris Buschelman      to float when performing the mixed-mode computation.
22703660e330SKris Buschelman      This may not be numerically reasonable for all applications.
22713660e330SKris Buschelman   */
22723660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
22733660e330SKris Buschelman 
22746f6a888dSBarry Smith   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
22756f6a888dSBarry Smith   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
22763660e330SKris Buschelman   {
2277eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2278eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
22792aa5897fSKris Buschelman     int            nz,i,idt,ai16;
22802aa5897fSKris Buschelman     unsigned int   jdx,idx;
22812aa5897fSKris Buschelman     unsigned short *vi;
2282eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
22833660e330SKris Buschelman 
2284eb05f457SKris Buschelman     /* First block is the identity. */
22853660e330SKris Buschelman     idx  = 0;
2286eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
22872aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
22883660e330SKris Buschelman 
22893660e330SKris Buschelman     for (i=1; i<n;) {
22903660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
22913660e330SKris Buschelman       vi   =  aj      + ai[i];
22923660e330SKris Buschelman       nz   =  diag[i] - ai[i];
22933660e330SKris Buschelman       idx +=  4;
22943660e330SKris Buschelman 
2295eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2296eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2297eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
22983660e330SKris Buschelman 
22993660e330SKris Buschelman       while (nz--) {
23003660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23012aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
23023660e330SKris Buschelman 
23033660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2304eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
23053660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23063660e330SKris Buschelman 
23073660e330SKris Buschelman           /* First Column */
23083660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23093660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23103660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23113660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23123660e330SKris Buschelman 
23133660e330SKris Buschelman           /* Second Column */
23143660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23153660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23163660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23173660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23183660e330SKris Buschelman 
23193660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23203660e330SKris Buschelman 
23213660e330SKris Buschelman           /* Third Column */
23223660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23233660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23243660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23263660e330SKris Buschelman 
23273660e330SKris Buschelman           /* Fourth Column */
23283660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23293660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23303660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23313660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23323660e330SKris Buschelman         SSE_INLINE_END_2
23333660e330SKris Buschelman 
23343660e330SKris Buschelman         v  += 16;
23353660e330SKris Buschelman       }
23363660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23373660e330SKris Buschelman       PREFETCH_NTA(v);
2338eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
23393660e330SKris Buschelman     }
2340eb05f457SKris Buschelman 
2341eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2342eb05f457SKris Buschelman 
23433660e330SKris Buschelman     idt  = 4*(n-1);
23443660e330SKris Buschelman     ai16 = 16*diag[n-1];
23453660e330SKris Buschelman     v    = aa + ai16 + 16;
23463660e330SKris Buschelman     for (i=n-1; i>=0;){
23473660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23483660e330SKris Buschelman       vi = aj + diag[i] + 1;
23493660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23503660e330SKris Buschelman 
2351eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
23523660e330SKris Buschelman 
23533660e330SKris Buschelman       while (nz--) {
23543660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23552aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
23563660e330SKris Buschelman 
23573660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2358eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
23593660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23603660e330SKris Buschelman 
23613660e330SKris Buschelman           /* First Column */
23623660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23633660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23643660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23653660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23663660e330SKris Buschelman 
23673660e330SKris Buschelman           /* Second Column */
23683660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23693660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23703660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23713660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23723660e330SKris Buschelman 
23733660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23743660e330SKris Buschelman 
23753660e330SKris Buschelman           /* Third Column */
23763660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23773660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23783660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23803660e330SKris Buschelman 
23813660e330SKris Buschelman           /* Fourth Column */
23823660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23833660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23843660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23863660e330SKris Buschelman         SSE_INLINE_END_2
23873660e330SKris Buschelman         v  += 16;
23883660e330SKris Buschelman       }
23893660e330SKris Buschelman       v    = aa + ai16;
23903660e330SKris Buschelman       ai16 = 16*diag[--i];
23913660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
23923660e330SKris Buschelman       /*
23933660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
23943660e330SKris Buschelman          which was inverted as part of the factorization
23953660e330SKris Buschelman       */
2396eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
23973660e330SKris Buschelman         /* First Column */
23983660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
23993660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
24003660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
24013660e330SKris Buschelman 
24023660e330SKris Buschelman         /* Second Column */
24033660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24043660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24053660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24063660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24073660e330SKris Buschelman 
24083660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24093660e330SKris Buschelman 
24103660e330SKris Buschelman         /* Third Column */
24113660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24123660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24133660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24143660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24153660e330SKris Buschelman 
24163660e330SKris Buschelman         /* Fourth Column */
24173660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24183660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24193660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24203660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24213660e330SKris Buschelman 
24223660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24233660e330SKris Buschelman       SSE_INLINE_END_3
24243660e330SKris Buschelman 
24253660e330SKris Buschelman       v    = aa + ai16 + 16;
24263660e330SKris Buschelman       idt -= 4;
24273660e330SKris Buschelman     }
2428eb05f457SKris Buschelman 
2429eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2430eb05f457SKris Buschelman     idt = 4*(n-1);
2431eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2432eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2433eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2434eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2435eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2436eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2437eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2438eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2439eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
244054693613SKris Buschelman       idt -= 4;
24413660e330SKris Buschelman     }
2442eb05f457SKris Buschelman 
2443eb05f457SKris Buschelman   } /* End of artificial scope. */
24446f6a888dSBarry Smith   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
24456f6a888dSBarry Smith   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
24463660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
24473660e330SKris Buschelman   SSE_SCOPE_END;
24483660e330SKris Buschelman   PetscFunctionReturn(0);
24493660e330SKris Buschelman }
24503660e330SKris Buschelman 
24517cf1b8d3SKris Buschelman #undef __FUNCT__
24527cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
24537cf1b8d3SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
24547cf1b8d3SKris Buschelman {
24557cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
24567cf1b8d3SKris Buschelman   int            *aj=a->j;
24577cf1b8d3SKris Buschelman   int            ierr,*ai=a->i,n=a->mbs,*diag = a->diag;
24587cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
24597cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
24607cf1b8d3SKris Buschelman 
24617cf1b8d3SKris Buschelman   PetscFunctionBegin;
24627cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
24637cf1b8d3SKris Buschelman   /*
24647cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
24657cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
24667cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
24677cf1b8d3SKris Buschelman   */
24687cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
24697cf1b8d3SKris Buschelman 
24707cf1b8d3SKris Buschelman   ierr = VecGetArrayFast(bb,&b);CHKERRQ(ierr);
24717cf1b8d3SKris Buschelman   ierr = VecGetArrayFast(xx,&x);CHKERRQ(ierr);
24727cf1b8d3SKris Buschelman   {
24737cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
24747cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
24757cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
24767cf1b8d3SKris Buschelman     int       jdx,idx;
24777cf1b8d3SKris Buschelman     int       *vi;
24787cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
24797cf1b8d3SKris Buschelman 
24807cf1b8d3SKris Buschelman     /* First block is the identity. */
24817cf1b8d3SKris Buschelman     idx  = 0;
24827cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
24837cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
24847cf1b8d3SKris Buschelman 
24857cf1b8d3SKris Buschelman     for (i=1; i<n;) {
24867cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
24877cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
24887cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
24897cf1b8d3SKris Buschelman       idx +=  4;
24907cf1b8d3SKris Buschelman 
24917cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
24927cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
24937cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
24947cf1b8d3SKris Buschelman 
24957cf1b8d3SKris Buschelman       while (nz--) {
24967cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
24977cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
24987cf1b8d3SKris Buschelman /*          jdx = *vi++; */
24997cf1b8d3SKris Buschelman 
25007cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
25017cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
25027cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25037cf1b8d3SKris Buschelman 
25047cf1b8d3SKris Buschelman           /* First Column */
25057cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25067cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25077cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25087cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25097cf1b8d3SKris Buschelman 
25107cf1b8d3SKris Buschelman           /* Second Column */
25117cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25127cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25137cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25147cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25157cf1b8d3SKris Buschelman 
25167cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25177cf1b8d3SKris Buschelman 
25187cf1b8d3SKris Buschelman           /* Third Column */
25197cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25207cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25217cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25227cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25237cf1b8d3SKris Buschelman 
25247cf1b8d3SKris Buschelman           /* Fourth Column */
25257cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25267cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25277cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25287cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25297cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25307cf1b8d3SKris Buschelman 
25317cf1b8d3SKris Buschelman         v  += 16;
25327cf1b8d3SKris Buschelman       }
25337cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
25347cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
25357cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
25367cf1b8d3SKris Buschelman     }
25377cf1b8d3SKris Buschelman 
25387cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
25397cf1b8d3SKris Buschelman 
25407cf1b8d3SKris Buschelman     idt  = 4*(n-1);
25417cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
25427cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
25437cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
25447cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25457cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
25467cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25477cf1b8d3SKris Buschelman 
25487cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25497cf1b8d3SKris Buschelman 
25507cf1b8d3SKris Buschelman       while (nz--) {
25517cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25527cf1b8d3SKris Buschelman         idx = 4*(*vi++);
25537cf1b8d3SKris Buschelman /*          idx = *vi++; */
25547cf1b8d3SKris Buschelman 
25557cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
25567cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
25577cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25587cf1b8d3SKris Buschelman 
25597cf1b8d3SKris Buschelman           /* First Column */
25607cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25617cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25627cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25637cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25647cf1b8d3SKris Buschelman 
25657cf1b8d3SKris Buschelman           /* Second Column */
25667cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25677cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25687cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25697cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25707cf1b8d3SKris Buschelman 
25717cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25727cf1b8d3SKris Buschelman 
25737cf1b8d3SKris Buschelman           /* Third Column */
25747cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25757cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25767cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25777cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25787cf1b8d3SKris Buschelman 
25797cf1b8d3SKris Buschelman           /* Fourth Column */
25807cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25817cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25827cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25837cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25847cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25857cf1b8d3SKris Buschelman         v  += 16;
25867cf1b8d3SKris Buschelman       }
25877cf1b8d3SKris Buschelman       v    = aa + ai16;
25887cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
25897cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
25907cf1b8d3SKris Buschelman       /*
25917cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
25927cf1b8d3SKris Buschelman          which was inverted as part of the factorization
25937cf1b8d3SKris Buschelman       */
25947cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
25957cf1b8d3SKris Buschelman         /* First Column */
25967cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
25977cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
25987cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
25997cf1b8d3SKris Buschelman 
26007cf1b8d3SKris Buschelman         /* Second Column */
26017cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
26027cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
26037cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
26047cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
26057cf1b8d3SKris Buschelman 
26067cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
26077cf1b8d3SKris Buschelman 
26087cf1b8d3SKris Buschelman         /* Third Column */
26097cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
26107cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
26117cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
26127cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
26137cf1b8d3SKris Buschelman 
26147cf1b8d3SKris Buschelman         /* Fourth Column */
26157cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
26167cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
26177cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
26187cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
26197cf1b8d3SKris Buschelman 
26207cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
26217cf1b8d3SKris Buschelman       SSE_INLINE_END_3
26227cf1b8d3SKris Buschelman 
26237cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
26247cf1b8d3SKris Buschelman       idt -= 4;
26257cf1b8d3SKris Buschelman     }
26267cf1b8d3SKris Buschelman 
26277cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
26287cf1b8d3SKris Buschelman     idt = 4*(n-1);
26297cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
26307cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
26317cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
26327cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
26337cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
26347cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
26357cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
26367cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
26377cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
26387cf1b8d3SKris Buschelman       idt -= 4;
26397cf1b8d3SKris Buschelman     }
26407cf1b8d3SKris Buschelman 
26417cf1b8d3SKris Buschelman   } /* End of artificial scope. */
26427cf1b8d3SKris Buschelman   ierr = VecRestoreArrayFast(bb,&b);CHKERRQ(ierr);
26437cf1b8d3SKris Buschelman   ierr = VecRestoreArrayFast(xx,&x);CHKERRQ(ierr);
26447cf1b8d3SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
26457cf1b8d3SKris Buschelman   SSE_SCOPE_END;
26467cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
26477cf1b8d3SKris Buschelman }
26487cf1b8d3SKris Buschelman 
26493660e330SKris Buschelman #endif
26504a2ae208SSatish Balay #undef __FUNCT__
26514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
26524e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
26534e2b4712SSatish Balay {
26544e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
26554e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
26564e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
26574e2b4712SSatish Balay   int             *diag = a->diag;
26583f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
265987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
26604e2b4712SSatish Balay 
26614e2b4712SSatish Balay   PetscFunctionBegin;
2662e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2663e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2664f1af5d2fSBarry Smith   t  = a->solve_work;
26654e2b4712SSatish Balay 
26664e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26674e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26684e2b4712SSatish Balay 
26694e2b4712SSatish Balay   /* forward solve the lower triangular */
26704e2b4712SSatish Balay   idx    = 3*(*r++);
2671f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
26724e2b4712SSatish Balay   for (i=1; i<n; i++) {
26734e2b4712SSatish Balay     v     = aa + 9*ai[i];
26744e2b4712SSatish Balay     vi    = aj + ai[i];
26754e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26764e2b4712SSatish Balay     idx   = 3*(*r++);
2677f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
26784e2b4712SSatish Balay     while (nz--) {
26794e2b4712SSatish Balay       idx   = 3*(*vi++);
2680f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2681f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2682f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2683f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
26844e2b4712SSatish Balay       v += 9;
26854e2b4712SSatish Balay     }
26864e2b4712SSatish Balay     idx = 3*i;
2687f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
26884e2b4712SSatish Balay   }
26894e2b4712SSatish Balay   /* backward solve the upper triangular */
26904e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26914e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
26924e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26934e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26944e2b4712SSatish Balay     idt  = 3*i;
2695f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
26964e2b4712SSatish Balay     while (nz--) {
26974e2b4712SSatish Balay       idx   = 3*(*vi++);
2698f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2699f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2700f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2701f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27024e2b4712SSatish Balay       v += 9;
27034e2b4712SSatish Balay     }
27044e2b4712SSatish Balay     idc = 3*(*c--);
27054e2b4712SSatish Balay     v   = aa + 9*diag[i];
2706f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2707f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2708f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
27094e2b4712SSatish Balay   }
27104e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27114e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2712e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2713e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2714b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
27154e2b4712SSatish Balay   PetscFunctionReturn(0);
27164e2b4712SSatish Balay }
27174e2b4712SSatish Balay 
271815091d37SBarry Smith /*
271915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
272015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
272115091d37SBarry Smith */
27224a2ae208SSatish Balay #undef __FUNCT__
27234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
272415091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
272515091d37SBarry Smith {
272615091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
272715091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
272815091d37SBarry Smith   int             ierr,*diag = a->diag;
272915091d37SBarry Smith   MatScalar       *aa=a->a,*v;
273087828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
273115091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
273215091d37SBarry Smith 
273315091d37SBarry Smith   PetscFunctionBegin;
273415091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
273515091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
273615091d37SBarry Smith 
273715091d37SBarry Smith 
273815091d37SBarry Smith   /* forward solve the lower triangular */
273915091d37SBarry Smith   idx    = 0;
274015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
274115091d37SBarry Smith   for (i=1; i<n; i++) {
274215091d37SBarry Smith     v     =  aa      + 9*ai[i];
274315091d37SBarry Smith     vi    =  aj      + ai[i];
274415091d37SBarry Smith     nz    =  diag[i] - ai[i];
274515091d37SBarry Smith     idx   +=  3;
2746f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
274715091d37SBarry Smith     while (nz--) {
274815091d37SBarry Smith       jdx   = 3*(*vi++);
274915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2750f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2751f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2752f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
275315091d37SBarry Smith       v    += 9;
275415091d37SBarry Smith     }
2755f1af5d2fSBarry Smith     x[idx]   = s1;
2756f1af5d2fSBarry Smith     x[1+idx] = s2;
2757f1af5d2fSBarry Smith     x[2+idx] = s3;
275815091d37SBarry Smith   }
275915091d37SBarry Smith   /* backward solve the upper triangular */
276015091d37SBarry Smith   for (i=n-1; i>=0; i--){
276115091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
276215091d37SBarry Smith     vi   = aj + diag[i] + 1;
276315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
276415091d37SBarry Smith     idt  = 3*i;
2765f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2766f1af5d2fSBarry Smith     s3 = x[2+idt];
276715091d37SBarry Smith     while (nz--) {
276815091d37SBarry Smith       idx   = 3*(*vi++);
276915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2770f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2771f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2772f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
277315091d37SBarry Smith       v    += 9;
277415091d37SBarry Smith     }
277515091d37SBarry Smith     v        = aa +  9*diag[i];
2776f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2777f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2778f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
277915091d37SBarry Smith   }
278015091d37SBarry Smith 
278115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
278215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2783b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
278415091d37SBarry Smith   PetscFunctionReturn(0);
278515091d37SBarry Smith }
278615091d37SBarry Smith 
27874a2ae208SSatish Balay #undef __FUNCT__
27884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
27894e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
27904e2b4712SSatish Balay {
27914e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
27924e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
27934e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
27944e2b4712SSatish Balay   int             *diag = a->diag;
27953f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
279687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
27974e2b4712SSatish Balay 
27984e2b4712SSatish Balay   PetscFunctionBegin;
2799e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2800e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2801f1af5d2fSBarry Smith   t  = a->solve_work;
28024e2b4712SSatish Balay 
28034e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28044e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28054e2b4712SSatish Balay 
28064e2b4712SSatish Balay   /* forward solve the lower triangular */
28074e2b4712SSatish Balay   idx    = 2*(*r++);
2808f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
28094e2b4712SSatish Balay   for (i=1; i<n; i++) {
28104e2b4712SSatish Balay     v     = aa + 4*ai[i];
28114e2b4712SSatish Balay     vi    = aj + ai[i];
28124e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28134e2b4712SSatish Balay     idx   = 2*(*r++);
2814f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
28154e2b4712SSatish Balay     while (nz--) {
28164e2b4712SSatish Balay       idx   = 2*(*vi++);
2817f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2818f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2819f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28204e2b4712SSatish Balay       v += 4;
28214e2b4712SSatish Balay     }
28224e2b4712SSatish Balay     idx = 2*i;
2823f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
28244e2b4712SSatish Balay   }
28254e2b4712SSatish Balay   /* backward solve the upper triangular */
28264e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28274e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
28284e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28294e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28304e2b4712SSatish Balay     idt  = 2*i;
2831f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
28324e2b4712SSatish Balay     while (nz--) {
28334e2b4712SSatish Balay       idx   = 2*(*vi++);
2834f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2835f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2836f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28374e2b4712SSatish Balay       v += 4;
28384e2b4712SSatish Balay     }
28394e2b4712SSatish Balay     idc = 2*(*c--);
28404e2b4712SSatish Balay     v   = aa + 4*diag[i];
2841f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2842f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
28434e2b4712SSatish Balay   }
28444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2846e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2847e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2848b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
28494e2b4712SSatish Balay   PetscFunctionReturn(0);
28504e2b4712SSatish Balay }
28514e2b4712SSatish Balay 
285215091d37SBarry Smith /*
285315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
285415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
285515091d37SBarry Smith */
28564a2ae208SSatish Balay #undef __FUNCT__
28574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
285815091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
285915091d37SBarry Smith {
286015091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
286115091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
286215091d37SBarry Smith   int             ierr,*diag = a->diag;
286315091d37SBarry Smith   MatScalar       *aa=a->a,*v;
286487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2;
286515091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
286615091d37SBarry Smith 
286715091d37SBarry Smith   PetscFunctionBegin;
286815091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
286915091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
287015091d37SBarry Smith 
287115091d37SBarry Smith   /* forward solve the lower triangular */
287215091d37SBarry Smith   idx    = 0;
287315091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
287415091d37SBarry Smith   for (i=1; i<n; i++) {
287515091d37SBarry Smith     v     =  aa      + 4*ai[i];
287615091d37SBarry Smith     vi    =  aj      + ai[i];
287715091d37SBarry Smith     nz    =  diag[i] - ai[i];
287815091d37SBarry Smith     idx   +=  2;
2879f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
288015091d37SBarry Smith     while (nz--) {
288115091d37SBarry Smith       jdx   = 2*(*vi++);
288215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2883f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2884f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
288515091d37SBarry Smith       v    += 4;
288615091d37SBarry Smith     }
2887f1af5d2fSBarry Smith     x[idx]   = s1;
2888f1af5d2fSBarry Smith     x[1+idx] = s2;
288915091d37SBarry Smith   }
289015091d37SBarry Smith   /* backward solve the upper triangular */
289115091d37SBarry Smith   for (i=n-1; i>=0; i--){
289215091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
289315091d37SBarry Smith     vi   = aj + diag[i] + 1;
289415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
289515091d37SBarry Smith     idt  = 2*i;
2896f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
289715091d37SBarry Smith     while (nz--) {
289815091d37SBarry Smith       idx   = 2*(*vi++);
289915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2900f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2901f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
290215091d37SBarry Smith       v    += 4;
290315091d37SBarry Smith     }
290415091d37SBarry Smith     v        = aa +  4*diag[i];
2905f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2906f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
290715091d37SBarry Smith   }
290815091d37SBarry Smith 
290915091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
291015091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2911b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
291215091d37SBarry Smith   PetscFunctionReturn(0);
291315091d37SBarry Smith }
291415091d37SBarry Smith 
29154a2ae208SSatish Balay #undef __FUNCT__
29164a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
29174e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
29184e2b4712SSatish Balay {
29194e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
29204e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
29214e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
29224e2b4712SSatish Balay   int             *diag = a->diag;
29233f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
292487828ca2SBarry Smith   PetscScalar     *x,*b,s1,*t;
29254e2b4712SSatish Balay 
29264e2b4712SSatish Balay   PetscFunctionBegin;
29274e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
29284e2b4712SSatish Balay 
2929e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2930e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2931f1af5d2fSBarry Smith   t  = a->solve_work;
29324e2b4712SSatish Balay 
29334e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29344e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29354e2b4712SSatish Balay 
29364e2b4712SSatish Balay   /* forward solve the lower triangular */
2937f1af5d2fSBarry Smith   t[0] = b[*r++];
29384e2b4712SSatish Balay   for (i=1; i<n; i++) {
29394e2b4712SSatish Balay     v     = aa + ai[i];
29404e2b4712SSatish Balay     vi    = aj + ai[i];
29414e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2942f1af5d2fSBarry Smith     s1  = b[*r++];
29434e2b4712SSatish Balay     while (nz--) {
2944f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29454e2b4712SSatish Balay     }
2946f1af5d2fSBarry Smith     t[i] = s1;
29474e2b4712SSatish Balay   }
29484e2b4712SSatish Balay   /* backward solve the upper triangular */
29494e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29504e2b4712SSatish Balay     v    = aa + diag[i] + 1;
29514e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29524e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2953f1af5d2fSBarry Smith     s1 = t[i];
29544e2b4712SSatish Balay     while (nz--) {
2955f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29564e2b4712SSatish Balay     }
2957f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
29584e2b4712SSatish Balay   }
29594e2b4712SSatish Balay 
29604e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29614e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2962e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2963e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2964b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
29654e2b4712SSatish Balay   PetscFunctionReturn(0);
29664e2b4712SSatish Balay }
296715091d37SBarry Smith /*
296815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
296915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
297015091d37SBarry Smith */
29714a2ae208SSatish Balay #undef __FUNCT__
29724a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
297315091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
297415091d37SBarry Smith {
297515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
297615091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
297715091d37SBarry Smith   int             ierr,*diag = a->diag;
297815091d37SBarry Smith   MatScalar       *aa=a->a;
297987828ca2SBarry Smith   PetscScalar     *x,*b;
298087828ca2SBarry Smith   PetscScalar     s1,x1;
298115091d37SBarry Smith   MatScalar       *v;
298215091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
298315091d37SBarry Smith 
298415091d37SBarry Smith   PetscFunctionBegin;
298515091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
298615091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
298715091d37SBarry Smith 
298815091d37SBarry Smith   /* forward solve the lower triangular */
298915091d37SBarry Smith   idx    = 0;
299015091d37SBarry Smith   x[0]   = b[0];
299115091d37SBarry Smith   for (i=1; i<n; i++) {
299215091d37SBarry Smith     v     =  aa      + ai[i];
299315091d37SBarry Smith     vi    =  aj      + ai[i];
299415091d37SBarry Smith     nz    =  diag[i] - ai[i];
299515091d37SBarry Smith     idx   +=  1;
2996f1af5d2fSBarry Smith     s1  =  b[idx];
299715091d37SBarry Smith     while (nz--) {
299815091d37SBarry Smith       jdx   = *vi++;
299915091d37SBarry Smith       x1    = x[jdx];
3000f1af5d2fSBarry Smith       s1 -= v[0]*x1;
300115091d37SBarry Smith       v    += 1;
300215091d37SBarry Smith     }
3003f1af5d2fSBarry Smith     x[idx]   = s1;
300415091d37SBarry Smith   }
300515091d37SBarry Smith   /* backward solve the upper triangular */
300615091d37SBarry Smith   for (i=n-1; i>=0; i--){
300715091d37SBarry Smith     v    = aa + diag[i] + 1;
300815091d37SBarry Smith     vi   = aj + diag[i] + 1;
300915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
301015091d37SBarry Smith     idt  = i;
3011f1af5d2fSBarry Smith     s1 = x[idt];
301215091d37SBarry Smith     while (nz--) {
301315091d37SBarry Smith       idx   = *vi++;
301415091d37SBarry Smith       x1    = x[idx];
3015f1af5d2fSBarry Smith       s1 -= v[0]*x1;
301615091d37SBarry Smith       v    += 1;
301715091d37SBarry Smith     }
301815091d37SBarry Smith     v        = aa +  diag[i];
3019f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
302015091d37SBarry Smith   }
302115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
302215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3023b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
302415091d37SBarry Smith   PetscFunctionReturn(0);
302515091d37SBarry Smith }
30264e2b4712SSatish Balay 
30274e2b4712SSatish Balay /* ----------------------------------------------------------------*/
30284e2b4712SSatish Balay /*
30294e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
30304e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
30314e2b4712SSatish Balay    Not a good example of code reuse.
30324e2b4712SSatish Balay */
3033ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
3034435faa5fSBarry Smith 
30354a2ae208SSatish Balay #undef __FUNCT__
30364a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3037435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
30384e2b4712SSatish Balay {
30394e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
30404e2b4712SSatish Balay   IS          isicol;
30414e2b4712SSatish Balay   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
30424e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
3043eb150c5cSKris Buschelman   int         *dloc,idx,row,m,fm,nzf,nzi,len, reallocate = 0,dcount = 0;
3044435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
30454533b203SBarry Smith   PetscTruth  col_identity,row_identity;
3046329f5518SBarry Smith   PetscReal   f;
30474e2b4712SSatish Balay 
30484e2b4712SSatish Balay   PetscFunctionBegin;
3049435faa5fSBarry Smith   f             = info->fill;
3050335d9088SBarry Smith   levels        = (int)info->levels;
3051335d9088SBarry Smith   diagonal_fill = (int)info->diagonal_fill;
30524c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3053667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3054667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3055309c388cSBarry Smith 
3056309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
3057bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
3058bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
3059bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
3060bb3d539aSBarry Smith     if (!b->diag) {
3061bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3062bb3d539aSBarry Smith     }
3063bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
3064bb3d539aSBarry Smith     b->row        = isrow;
3065bb3d539aSBarry Smith     b->col        = iscol;
3066bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3067bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3068bb3d539aSBarry Smith     b->icol       = isicol;
3069bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
307087828ca2SBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3071309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
30724e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
30734e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
30744e2b4712SSatish Balay 
30754e2b4712SSatish Balay     /* get new row pointers */
3076b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
30774e2b4712SSatish Balay     ainew[0] = 0;
30784e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
30794e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
308082502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
30814e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
308282502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
30834e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3084b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
30854e2b4712SSatish Balay     /* im is level for each filled value */
3086b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
30874e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3088b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
30894e2b4712SSatish Balay     dloc[0]  = 0;
30904e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3091435faa5fSBarry Smith 
3092435faa5fSBarry Smith       /* copy prow into linked list */
30934e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
309429bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
30954e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
30964e2b4712SSatish Balay       fill[n]    = n;
3097435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
30984e2b4712SSatish Balay       while (nz--) {
30994e2b4712SSatish Balay 	fm  = n;
31004e2b4712SSatish Balay 	idx = ic[*xi++];
31014e2b4712SSatish Balay 	do {
31024e2b4712SSatish Balay 	  m  = fm;
31034e2b4712SSatish Balay 	  fm = fill[m];
31044e2b4712SSatish Balay 	} while (fm < idx);
31054e2b4712SSatish Balay 	fill[m]   = idx;
31064e2b4712SSatish Balay 	fill[idx] = fm;
31074e2b4712SSatish Balay 	im[idx]   = 0;
31084e2b4712SSatish Balay       }
3109435faa5fSBarry Smith 
3110435faa5fSBarry Smith       /* make sure diagonal entry is included */
3111435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3112435faa5fSBarry Smith 	fm = n;
3113435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3114435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3115435faa5fSBarry Smith 	fill[fm]   = prow;
3116435faa5fSBarry Smith 	im[prow]   = 0;
3117435faa5fSBarry Smith 	nzf++;
3118335d9088SBarry Smith 	dcount++;
3119435faa5fSBarry Smith       }
3120435faa5fSBarry Smith 
31214e2b4712SSatish Balay       nzi = 0;
31224e2b4712SSatish Balay       row = fill[n];
31234e2b4712SSatish Balay       while (row < prow) {
31244e2b4712SSatish Balay 	incrlev = im[row] + 1;
31254e2b4712SSatish Balay 	nz      = dloc[row];
3126435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
31274e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
31284e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
31294e2b4712SSatish Balay 	fm      = row;
31304e2b4712SSatish Balay 	while (nnz-- > 0) {
31314e2b4712SSatish Balay 	  idx = *xi++;
31324e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
31334e2b4712SSatish Balay 	    flev++;
31344e2b4712SSatish Balay 	    continue;
31354e2b4712SSatish Balay 	  }
31364e2b4712SSatish Balay 	  do {
31374e2b4712SSatish Balay 	    m  = fm;
31384e2b4712SSatish Balay 	    fm = fill[m];
31394e2b4712SSatish Balay 	  } while (fm < idx);
31404e2b4712SSatish Balay 	  if (fm != idx) {
31414e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
31424e2b4712SSatish Balay 	    fill[m]   = idx;
31434e2b4712SSatish Balay 	    fill[idx] = fm;
31444e2b4712SSatish Balay 	    fm        = idx;
31454e2b4712SSatish Balay 	    nzf++;
3146ecf371e4SBarry Smith 	  } else {
31474e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
31484e2b4712SSatish Balay 	  }
31494e2b4712SSatish Balay 	  flev++;
31504e2b4712SSatish Balay 	}
31514e2b4712SSatish Balay 	row = fill[row];
31524e2b4712SSatish Balay 	nzi++;
31534e2b4712SSatish Balay       }
31544e2b4712SSatish Balay       /* copy new filled row into permanent storage */
31554e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
31564e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3157ecf371e4SBarry Smith 
3158ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3159ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3160ecf371e4SBarry Smith 	/* just double the memory each time */
3161ecf371e4SBarry Smith 	int maxadd = jmax;
3162ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
31634e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
31644e2b4712SSatish Balay 	jmax += maxadd;
3165ecf371e4SBarry Smith 
3166ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
316782502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3168549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3169606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
31704e2b4712SSatish Balay 	ajnew = xi;
317182502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
3172549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
3173606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
31744e2b4712SSatish Balay 	ajfill = xi;
3175eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
31764e2b4712SSatish Balay       }
31774e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
31784e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
31794e2b4712SSatish Balay       dloc[prow]  = nzi;
31804e2b4712SSatish Balay       fm          = fill[n];
31814e2b4712SSatish Balay       while (nzf--) {
31824e2b4712SSatish Balay 	*xi++   = fm;
31834e2b4712SSatish Balay 	*flev++ = im[fm];
31844e2b4712SSatish Balay 	fm      = fill[fm];
31854e2b4712SSatish Balay       }
3186435faa5fSBarry Smith       /* make sure row has diagonal entry */
3187435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
318829bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
3189435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
3190435faa5fSBarry Smith       }
31914e2b4712SSatish Balay     }
3192606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
31934e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
31944e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3195606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3196606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
31974e2b4712SSatish Balay 
31984e2b4712SSatish Balay     {
3199329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3200eb150c5cSKris Buschelman       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",reallocate,f,af);
3201b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
3202b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
3203b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
3204335d9088SBarry Smith       if (diagonal_fill) {
3205b1bcba4aSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replaced %d missing diagonals",dcount);
3206335d9088SBarry Smith       }
32074e2b4712SSatish Balay     }
32084e2b4712SSatish Balay 
32094e2b4712SSatish Balay     /* put together the new matrix */
32104e2b4712SSatish Balay     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
3211b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
32124e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
3213606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
32147c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
32153f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
32164e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
3217606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
3218606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
321982502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
32204e2b4712SSatish Balay     b->j          = ajnew;
32214e2b4712SSatish Balay     b->i          = ainew;
32224e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
32234e2b4712SSatish Balay     b->diag       = dloc;
32244e2b4712SSatish Balay     b->ilen       = 0;
32254e2b4712SSatish Balay     b->imax       = 0;
32264e2b4712SSatish Balay     b->row        = isrow;
32274e2b4712SSatish Balay     b->col        = iscol;
3228bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3229c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3230c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3231e51c0b9cSSatish Balay     b->icol       = isicol;
323287828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
32334e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
32344e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
323587828ca2SBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
32364e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
32374e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
32384e2b4712SSatish Balay 
3239eb150c5cSKris Buschelman     (*fact)->info.factor_mallocs    = reallocate;
32404e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
3241329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3242309c388cSBarry Smith   }
32434e2b4712SSatish Balay 
3244309c388cSBarry Smith   if (row_identity && col_identity) {
3245732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
32468661488fSKris Buschelman   }
32478661488fSKris Buschelman   PetscFunctionReturn(0);
32488661488fSKris Buschelman }
32498661488fSKris Buschelman 
3250732ee342SKris Buschelman #undef __FUNCT__
32517e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
32527e7071cdSKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
32537e7071cdSKris Buschelman {
32547e7071cdSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
32552aa5897fSKris Buschelman   int i,*AJ=a->j,nz=a->nz;
3256*5a9542e3SKris Buschelman   PetscFunctionBegin;
32577cf1b8d3SKris Buschelman   /* Undo Column scaling */
32587cf1b8d3SKris Buschelman /*    while (nz--) { */
32597cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
32607cf1b8d3SKris Buschelman /*    } */
32617cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
32627cf1b8d3SKris Buschelman }
32637cf1b8d3SKris Buschelman 
32647cf1b8d3SKris Buschelman #undef __FUNCT__
32657cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
32667cf1b8d3SKris Buschelman int MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
32677cf1b8d3SKris Buschelman {
32687cf1b8d3SKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data;
32697cf1b8d3SKris Buschelman   int i,*AJ=a->j,nz=a->nz;
32702aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
3271*5a9542e3SKris Buschelman   PetscFunctionBegin;
327220235379SKris Buschelman   while (nz--) {
32732aa5897fSKris Buschelman     AJ[i] = (int)((unsigned int)aj[i]); /* First extend, then convert to signed. */
32747e7071cdSKris Buschelman   }
32757e7071cdSKris Buschelman   PetscFunctionReturn(0);
32767e7071cdSKris Buschelman }
32777e7071cdSKris Buschelman 
32787e7071cdSKris Buschelman #undef __FUNCT__
3279732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
3280732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
32818661488fSKris Buschelman {
32828661488fSKris Buschelman   /*
32838661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
32848661488fSKris Buschelman       with natural ordering
32858661488fSKris Buschelman   */
32868661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
32878661488fSKris Buschelman 
32888661488fSKris Buschelman   PetscFunctionBegin;
3289a7ba9c3cSKris Buschelman   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
3290a7ba9c3cSKris Buschelman   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
32918661488fSKris Buschelman   switch (a->bs) {
32928661488fSKris Buschelman   case 1:
32938661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
3294732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
3295732ee342SKris Buschelman     break;
3296309c388cSBarry Smith   case 2:
32978661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
3298732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
3299309c388cSBarry Smith     break;
3300309c388cSBarry Smith   case 3:
33018661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
3302732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
3303309c388cSBarry Smith     break;
3304309c388cSBarry Smith   case 4:
3305a7d8d0baSKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3306a7d8d0baSKris Buschelman     {
3307a7d8d0baSKris Buschelman       PetscTruth  sse_enabled_local;
330843b9cc93SKris Buschelman       int         ierr;
3309ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
33106b7cc795SKris Buschelman       if (sse_enabled_local) {
3311b988c221SKris Buschelman #  if defined(PETSC_HAVE_SSE)
33127cf1b8d3SKris Buschelman         int i,*AJ=a->j,nz=a->nz,n=a->mbs;
33137cf1b8d3SKris Buschelman         if (n==(unsigned short)n) {
33142aa5897fSKris Buschelman           unsigned short *aj=(unsigned short *)AJ;
331513c7ffeeSKris Buschelman           for (i=0;i<nz;i++) {
33162aa5897fSKris Buschelman             aj[i] = (unsigned short)AJ[i];
331713c7ffeeSKris Buschelman           }
33187cf1b8d3SKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj;
33197cf1b8d3SKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE_usj;
332086b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, ushort j index factor BS=4\n");
33217cf1b8d3SKris Buschelman         } else {
33227cf1b8d3SKris Buschelman         /* Scale the column indices for easier indexing in MatSolve. */
33237cf1b8d3SKris Buschelman /*            for (i=0;i<nz;i++) { */
33247cf1b8d3SKris Buschelman /*              AJ[i] = AJ[i]*4; */
33257cf1b8d3SKris Buschelman /*            } */
33267e7071cdSKris Buschelman           inA->ops->setunfactored   = MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE;
33278661488fSKris Buschelman           inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
332886b4ebfeSKris Buschelman           PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering, int j index factor BS=4\n");
33297cf1b8d3SKris Buschelman         }
3330b988c221SKris Buschelman #  else
3331b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3332b988c221SKris Buschelman         SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3333b988c221SKris Buschelman #  endif
33343ba47ebaSKris Buschelman       } else {
33358661488fSKris Buschelman         inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3336732ee342SKris Buschelman         PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
33373ba47ebaSKris Buschelman       }
3338a7d8d0baSKris Buschelman     }
3339a7d8d0baSKris Buschelman #else
3340a7d8d0baSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
3341a7d8d0baSKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
3342a7d8d0baSKris Buschelman #endif
3343309c388cSBarry Smith     break;
3344309c388cSBarry Smith   case 5:
33458661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
3346732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
3347309c388cSBarry Smith     break;
3348309c388cSBarry Smith   case 6:
33498661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
3350732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
3351309c388cSBarry Smith     break;
3352309c388cSBarry Smith   case 7:
33538661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
3354732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
3355309c388cSBarry Smith     break;
3356309c388cSBarry Smith   }
33574e2b4712SSatish Balay   PetscFunctionReturn(0);
33584e2b4712SSatish Balay }
3359732ee342SKris Buschelman 
3360732ee342SKris Buschelman #undef __FUNCT__
3361732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
3362732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A)
3363732ee342SKris Buschelman {
3364732ee342SKris Buschelman   /*
3365732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
3366732ee342SKris Buschelman       with natural ordering
3367732ee342SKris Buschelman   */
3368732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
3369732ee342SKris Buschelman   IS          row = a->row, col = a->col;
3370732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
337123c42b7cSKris Buschelman   PetscTruth  use_natural;
3372732ee342SKris Buschelman   int         ierr;
3373732ee342SKris Buschelman 
3374732ee342SKris Buschelman   PetscFunctionBegin;
3375cf242676SKris Buschelman 
337694ee7fc8SKris Buschelman   use_natural = PETSC_FALSE;
3377cf242676SKris Buschelman 
3378732ee342SKris Buschelman   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
3379732ee342SKris Buschelman   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
3380732ee342SKris Buschelman 
3381732ee342SKris Buschelman   if (row_identity && col_identity) {
3382732ee342SKris Buschelman     use_natural = PETSC_TRUE;
3383732ee342SKris Buschelman   } else {
3384732ee342SKris Buschelman     use_natural = PETSC_FALSE;
3385732ee342SKris Buschelman   }
3386732ee342SKris Buschelman   switch (a->bs) {
3387732ee342SKris Buschelman   case 1:
3388732ee342SKris Buschelman     if (use_natural) {
3389732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
3390732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
3391732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
3392732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3393732ee342SKris Buschelman     } else {
3394732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
3395732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
3396732ee342SKris Buschelman     }
3397732ee342SKris Buschelman     break;
3398732ee342SKris Buschelman   case 2:
3399732ee342SKris Buschelman     if (use_natural) {
3400732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
3401732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
3402732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
3403732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3404732ee342SKris Buschelman     } else {
3405732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
3406732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
3407732ee342SKris Buschelman     }
3408732ee342SKris Buschelman     break;
3409732ee342SKris Buschelman   case 3:
3410732ee342SKris Buschelman     if (use_natural) {
3411732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
3412732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
3413732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
3414732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3415732ee342SKris Buschelman     } else {
3416732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
3417732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
3418732ee342SKris Buschelman     }
3419732ee342SKris Buschelman     break;
3420732ee342SKris Buschelman   case 4:
3421f26ec98cSKris Buschelman     {
3422123145dfSKris Buschelman       PetscTruth sse_enabled_local;
3423ccaa8a1bSKris Buschelman       ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,PETSC_NULL);CHKERRQ(ierr);
3424732ee342SKris Buschelman       if (use_natural) {
34252859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3426f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Natural + Single + SSE */
3427eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3428995eb297SKris Buschelman           int n=a->mbs;
3429995eb297SKris Buschelman           if (n==(unsigned short)n) {
3430995eb297SKris Buschelman             A->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj;
3431995eb297SKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, ushort j index, natural ordering solve BS=4\n");
3432995eb297SKris Buschelman           } else {
3433732ee342SKris Buschelman             A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
343486b4ebfeSKris Buschelman             PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place, int j index, natural ordering solve BS=4\n");
3435995eb297SKris Buschelman           }
3436eb150c5cSKris Buschelman #  else
3437eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3438eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3439eb150c5cSKris Buschelman #  endif
3440f26ec98cSKris Buschelman         } else { /* Natural + Single */
3441f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion;
3442123145dfSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, in-place, natural ordering solve BS=4\n");
3443f26ec98cSKris Buschelman         }
34442859b196SKris Buschelman #else
34452859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3446123145dfSKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
34472859b196SKris Buschelman #endif
3448732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3449123145dfSKris Buschelman         PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place, natural ordering solve BS=4\n");
3450f26ec98cSKris Buschelman       } else { /* Arbitrary ordering */
34512859b196SKris Buschelman #if defined(PETSC_USE_MAT_SINGLE)
3452f26ec98cSKris Buschelman         if (sse_enabled_local) { /* Arbitrary + Single + SSE */
3453eb150c5cSKris Buschelman #  if defined(PETSC_HAVE_SSE)
3454732ee342SKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_SSE_Demotion;
3455732ee342SKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3456eb150c5cSKris Buschelman #  else
3457eb150c5cSKris Buschelman           /* This should never be reached, unless there is a bug in PetscSSEIsEnabled(). */
3458eb150c5cSKris Buschelman           SETERRQ(PETSC_ERR_SUP,"SSE implementations are unavailable.");
3459eb150c5cSKris Buschelman #  endif
3460f26ec98cSKris Buschelman         } else { /* Arbitrary + Single */
3461f26ec98cSKris Buschelman           A->ops->solve         = MatSolve_SeqBAIJ_4_Demotion;
3462f26ec98cSKris Buschelman           PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision solve BS=4\n");
3463732ee342SKris Buschelman         }
34642859b196SKris Buschelman #else
34652859b196SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
34662859b196SKris Buschelman #endif
3467732ee342SKris Buschelman         A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3468732ee342SKris Buschelman       }
3469f26ec98cSKris Buschelman     }
3470732ee342SKris Buschelman     break;
3471732ee342SKris Buschelman   case 5:
3472732ee342SKris Buschelman     if (use_natural) {
3473732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3474732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3475732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3476732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3477732ee342SKris Buschelman     } else {
3478732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3479732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3480732ee342SKris Buschelman     }
3481732ee342SKris Buschelman     break;
3482732ee342SKris Buschelman   case 6:
3483732ee342SKris Buschelman     if (use_natural) {
3484732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3485732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3486732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3487732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3488732ee342SKris Buschelman     } else {
3489732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3490732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3491732ee342SKris Buschelman     }
3492732ee342SKris Buschelman     break;
3493732ee342SKris Buschelman   case 7:
3494732ee342SKris Buschelman     if (use_natural) {
3495732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3496732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3497732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3498732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3499732ee342SKris Buschelman     } else {
3500732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3501732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3502732ee342SKris Buschelman     }
3503732ee342SKris Buschelman     break;
350431801e53SKris Buschelman   default:
350531801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
350631801e53SKris Buschelman     break;
3507732ee342SKris Buschelman   }
3508732ee342SKris Buschelman   PetscFunctionReturn(0);
3509732ee342SKris Buschelman }
3510732ee342SKris Buschelman 
3511732ee342SKris Buschelman #undef __FUNCT__
3512732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3513732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3514732ee342SKris Buschelman   int ierr;
3515732ee342SKris Buschelman 
3516732ee342SKris Buschelman   PetscFunctionBegin;
3517732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3518cf242676SKris Buschelman   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3519732ee342SKris Buschelman     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3520cf242676SKris Buschelman   } else {
3521cf242676SKris Buschelman     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3522cf242676SKris Buschelman   }
3523732ee342SKris Buschelman   PetscFunctionReturn(0);
3524732ee342SKris Buschelman }
3525732ee342SKris Buschelman 
3526732ee342SKris Buschelman #undef __FUNCT__
3527732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3528732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3529732ee342SKris Buschelman   int ierr;
3530732ee342SKris Buschelman 
3531732ee342SKris Buschelman   PetscFunctionBegin;
3532732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3533732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3534732ee342SKris Buschelman   PetscFunctionReturn(0);
3535732ee342SKris Buschelman }
3536732ee342SKris Buschelman 
3537732ee342SKris Buschelman 
3538