xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision ef66eb6987ddfdf4e414d6b820cbc8d8d7d17bc2)
1*ef66eb69SBarry Smith /*$Id: baijfact2.c,v 1.70 2001/08/07 03:02:55 balay Exp bsmith $*/
24e2b4712SSatish Balay /*
34e2b4712SSatish Balay     Factorization code for BAIJ format.
44e2b4712SSatish Balay */
54e2b4712SSatish Balay 
64e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
74e2b4712SSatish Balay #include "src/vec/vecimpl.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
137c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
16f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17f1af5d2fSBarry Smith   int             *diag = a->diag;
18f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
1987828ca2SBarry Smith   PetscScalar     s1,*x,*b;
20f1af5d2fSBarry Smith 
21f1af5d2fSBarry Smith   PetscFunctionBegin;
22*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
23f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25f1af5d2fSBarry Smith 
26f1af5d2fSBarry Smith   /* forward solve the U^T */
27f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith     v     = aa + diag[i];
30f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
31*ef66eb69SBarry Smith     s1    = (*v++)*x[i];
32f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
33f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
34f1af5d2fSBarry Smith     while (nz--) {
35f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
36f1af5d2fSBarry Smith     }
37f1af5d2fSBarry Smith     x[i]   = s1;
38f1af5d2fSBarry Smith   }
39f1af5d2fSBarry Smith   /* backward solve the L^T */
40f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
41f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
42f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
43f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
44f1af5d2fSBarry Smith     s1   = x[i];
45f1af5d2fSBarry Smith     while (nz--) {
46f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
47f1af5d2fSBarry Smith     }
48f1af5d2fSBarry Smith   }
49f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
52f1af5d2fSBarry Smith   PetscFunctionReturn(0);
53f1af5d2fSBarry Smith }
54f1af5d2fSBarry Smith 
554a2ae208SSatish Balay #undef __FUNCT__
564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58f1af5d2fSBarry Smith {
59f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
60f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
61f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
62f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
6387828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
6487828ca2SBarry Smith   PetscScalar     *x,*b;
65f1af5d2fSBarry Smith 
66f1af5d2fSBarry Smith   PetscFunctionBegin;
67*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
68f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
69f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
70f1af5d2fSBarry Smith 
71f1af5d2fSBarry Smith   /* forward solve the U^T */
72f1af5d2fSBarry Smith   idx = 0;
73f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
76f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
77*ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
78f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
79f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
80f1af5d2fSBarry Smith     v += 4;
81f1af5d2fSBarry Smith 
82f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
83f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
84f1af5d2fSBarry Smith     while (nz--) {
85f1af5d2fSBarry Smith       oidx = 2*(*vi++);
86f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
87f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
88f1af5d2fSBarry Smith       v  += 4;
89f1af5d2fSBarry Smith     }
90f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
91f1af5d2fSBarry Smith     idx += 2;
92f1af5d2fSBarry Smith   }
93f1af5d2fSBarry Smith   /* backward solve the L^T */
94f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
95f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
96f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
97f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
98f1af5d2fSBarry Smith     idt  = 2*i;
99f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
100f1af5d2fSBarry Smith     while (nz--) {
101f1af5d2fSBarry Smith       idx   = 2*(*vi--);
102f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
103f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
104f1af5d2fSBarry Smith       v -= 4;
105f1af5d2fSBarry Smith     }
106f1af5d2fSBarry Smith   }
107f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
108f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
110f1af5d2fSBarry Smith   PetscFunctionReturn(0);
111f1af5d2fSBarry Smith }
112f1af5d2fSBarry Smith 
1134a2ae208SSatish Balay #undef __FUNCT__
1144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
1157c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
116f1af5d2fSBarry Smith {
117f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
118f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
119f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
120f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
12187828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
12287828ca2SBarry Smith   PetscScalar     *x,*b;
123f1af5d2fSBarry Smith 
124f1af5d2fSBarry Smith   PetscFunctionBegin;
125*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
126f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
127f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   /* forward solve the U^T */
130f1af5d2fSBarry Smith   idx = 0;
131f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
132f1af5d2fSBarry Smith 
133f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
134f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
135*ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
136f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
137f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
138f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
139f1af5d2fSBarry Smith     v += 9;
140f1af5d2fSBarry Smith 
141f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
142f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
143f1af5d2fSBarry Smith     while (nz--) {
144f1af5d2fSBarry Smith       oidx = 3*(*vi++);
145f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
146f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
147f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
148f1af5d2fSBarry Smith       v  += 9;
149f1af5d2fSBarry Smith     }
150f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
151f1af5d2fSBarry Smith     idx += 3;
152f1af5d2fSBarry Smith   }
153f1af5d2fSBarry Smith   /* backward solve the L^T */
154f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
155f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
156f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
157f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
158f1af5d2fSBarry Smith     idt  = 3*i;
159f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
160f1af5d2fSBarry Smith     while (nz--) {
161f1af5d2fSBarry Smith       idx   = 3*(*vi--);
162f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
163f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
164f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
165f1af5d2fSBarry Smith       v -= 9;
166f1af5d2fSBarry Smith     }
167f1af5d2fSBarry Smith   }
168f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
169f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
171f1af5d2fSBarry Smith   PetscFunctionReturn(0);
172f1af5d2fSBarry Smith }
173f1af5d2fSBarry Smith 
1744a2ae208SSatish Balay #undef __FUNCT__
1754a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
1767c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
177f1af5d2fSBarry Smith {
178f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
179f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
181f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
18287828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
18387828ca2SBarry Smith   PetscScalar     *x,*b;
184f1af5d2fSBarry Smith 
185f1af5d2fSBarry Smith   PetscFunctionBegin;
186*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
187f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
188f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189f1af5d2fSBarry Smith 
190f1af5d2fSBarry Smith   /* forward solve the U^T */
191f1af5d2fSBarry Smith   idx = 0;
192f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
195f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
196*ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
197f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
198f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
199f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
200f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
201f1af5d2fSBarry Smith     v += 16;
202f1af5d2fSBarry Smith 
203f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
204f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
205f1af5d2fSBarry Smith     while (nz--) {
206f1af5d2fSBarry Smith       oidx = 4*(*vi++);
207f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
208f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
209f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
210f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
211f1af5d2fSBarry Smith       v  += 16;
212f1af5d2fSBarry Smith     }
213f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
214f1af5d2fSBarry Smith     idx += 4;
215f1af5d2fSBarry Smith   }
216f1af5d2fSBarry Smith   /* backward solve the L^T */
217f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
218f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
219f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
220f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
221f1af5d2fSBarry Smith     idt  = 4*i;
222f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
223f1af5d2fSBarry Smith     while (nz--) {
224f1af5d2fSBarry Smith       idx   = 4*(*vi--);
225f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
226f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
227f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
228f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
229f1af5d2fSBarry Smith       v -= 16;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
232f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
233f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
2407c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
241f1af5d2fSBarry Smith {
242f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
243f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
244f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
245f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
24687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
24787828ca2SBarry Smith   PetscScalar     *x,*b;
248f1af5d2fSBarry Smith 
249f1af5d2fSBarry Smith   PetscFunctionBegin;
250*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
251f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
252f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   /* forward solve the U^T */
255f1af5d2fSBarry Smith   idx = 0;
256f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
257f1af5d2fSBarry Smith 
258f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
259f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
260*ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
261f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
262f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
263f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
264f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
265f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
266f1af5d2fSBarry Smith     v += 25;
267f1af5d2fSBarry Smith 
268f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
269f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
270f1af5d2fSBarry Smith     while (nz--) {
271f1af5d2fSBarry Smith       oidx = 5*(*vi++);
272f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
273f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
274f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
275f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
276f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
277f1af5d2fSBarry Smith       v  += 25;
278f1af5d2fSBarry Smith     }
279f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
280f1af5d2fSBarry Smith     idx += 5;
281f1af5d2fSBarry Smith   }
282f1af5d2fSBarry Smith   /* backward solve the L^T */
283f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
284f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
285f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
286f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
287f1af5d2fSBarry Smith     idt  = 5*i;
288f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
289f1af5d2fSBarry Smith     while (nz--) {
290f1af5d2fSBarry Smith       idx   = 5*(*vi--);
291f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
292f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
293f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
294f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
295f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
296f1af5d2fSBarry Smith       v -= 25;
297f1af5d2fSBarry Smith     }
298f1af5d2fSBarry Smith   }
299f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
300f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
301b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
302f1af5d2fSBarry Smith   PetscFunctionReturn(0);
303f1af5d2fSBarry Smith }
304f1af5d2fSBarry Smith 
3054a2ae208SSatish Balay #undef __FUNCT__
3064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
3077c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
308f1af5d2fSBarry Smith {
309f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
310f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
311f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
312f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
31387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31487828ca2SBarry Smith   PetscScalar     *x,*b;
315f1af5d2fSBarry Smith 
316f1af5d2fSBarry Smith   PetscFunctionBegin;
317*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
318f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
319f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith   /* forward solve the U^T */
322f1af5d2fSBarry Smith   idx = 0;
323f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
324f1af5d2fSBarry Smith 
325f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
326f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
327*ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
328*ef66eb69SBarry Smith     x6    = x[5+idx];
329f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
330f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
331f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
332f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
333f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
334f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
335f1af5d2fSBarry Smith     v += 36;
336f1af5d2fSBarry Smith 
337f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
338f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
339f1af5d2fSBarry Smith     while (nz--) {
340f1af5d2fSBarry Smith       oidx = 6*(*vi++);
341f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
342f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
343f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
344f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
345f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
346f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
347f1af5d2fSBarry Smith       v  += 36;
348f1af5d2fSBarry Smith     }
349f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
350f1af5d2fSBarry Smith     x[5+idx] = s6;
351f1af5d2fSBarry Smith     idx += 6;
352f1af5d2fSBarry Smith   }
353f1af5d2fSBarry Smith   /* backward solve the L^T */
354f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
355f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
356f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
357f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
358f1af5d2fSBarry Smith     idt  = 6*i;
359f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
360f1af5d2fSBarry Smith     s6 = x[5+idt];
361f1af5d2fSBarry Smith     while (nz--) {
362f1af5d2fSBarry Smith       idx   = 6*(*vi--);
363f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
364f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
365f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
366f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
367f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
368f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
369f1af5d2fSBarry Smith       v -= 36;
370f1af5d2fSBarry Smith     }
371f1af5d2fSBarry Smith   }
372f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
373f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
374b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
375f1af5d2fSBarry Smith   PetscFunctionReturn(0);
376f1af5d2fSBarry Smith }
377f1af5d2fSBarry Smith 
3784a2ae208SSatish Balay #undef __FUNCT__
3794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
3807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
381f1af5d2fSBarry Smith {
382f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
383f1af5d2fSBarry Smith   int             ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
384f1af5d2fSBarry Smith   int             *diag = a->diag,oidx;
385f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
38687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
38787828ca2SBarry Smith   PetscScalar     *x,*b;
388f1af5d2fSBarry Smith 
389f1af5d2fSBarry Smith   PetscFunctionBegin;
390*ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
391f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
392f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393f1af5d2fSBarry Smith 
394f1af5d2fSBarry Smith   /* forward solve the U^T */
395f1af5d2fSBarry Smith   idx = 0;
396f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
399f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
400*ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
401*ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
402f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
403f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
404f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
405f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
406f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
407f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
408f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
409f1af5d2fSBarry Smith     v += 49;
410f1af5d2fSBarry Smith 
411f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
412f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
413f1af5d2fSBarry Smith     while (nz--) {
414f1af5d2fSBarry Smith       oidx = 7*(*vi++);
415f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
416f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
417f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
418f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
419f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
420f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
421f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
422f1af5d2fSBarry Smith       v  += 49;
423f1af5d2fSBarry Smith     }
424f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
425f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
426f1af5d2fSBarry Smith     idx += 7;
427f1af5d2fSBarry Smith   }
428f1af5d2fSBarry Smith   /* backward solve the L^T */
429f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
430f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
431f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
432f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
433f1af5d2fSBarry Smith     idt  = 7*i;
434f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
435f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
436f1af5d2fSBarry Smith     while (nz--) {
437f1af5d2fSBarry Smith       idx   = 7*(*vi--);
438f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
439f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
440f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
441f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
442f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
443f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
444f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
445f1af5d2fSBarry Smith       v -= 49;
446f1af5d2fSBarry Smith     }
447f1af5d2fSBarry Smith   }
448f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
449f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
450b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
451f1af5d2fSBarry Smith   PetscFunctionReturn(0);
452f1af5d2fSBarry Smith }
453f1af5d2fSBarry Smith 
454f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4554a2ae208SSatish Balay #undef __FUNCT__
4564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
4577c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
458f1af5d2fSBarry Smith {
459f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
460f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
461f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
462f1af5d2fSBarry Smith   int             *diag = a->diag;
463f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
46487828ca2SBarry Smith   PetscScalar     s1,*x,*b,*t;
465f1af5d2fSBarry Smith 
466f1af5d2fSBarry Smith   PetscFunctionBegin;
467f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
468f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
469f1af5d2fSBarry Smith   t  = a->solve_work;
470f1af5d2fSBarry Smith 
471f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
472f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
473f1af5d2fSBarry Smith 
474f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
475f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
476f1af5d2fSBarry Smith     t[i] = b[c[i]];
477f1af5d2fSBarry Smith   }
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith   /* forward solve the U^T */
480f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith     v     = aa + diag[i];
483f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
484f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
485f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
486f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
487f1af5d2fSBarry Smith     while (nz--) {
488f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
489f1af5d2fSBarry Smith     }
490f1af5d2fSBarry Smith     t[i]   = s1;
491f1af5d2fSBarry Smith   }
492f1af5d2fSBarry Smith   /* backward solve the L^T */
493f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
494f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
495f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
496f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
497f1af5d2fSBarry Smith     s1   = t[i];
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith   }
502f1af5d2fSBarry Smith 
503f1af5d2fSBarry Smith   /* copy t into x according to permutation */
504f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
505f1af5d2fSBarry Smith     x[r[i]]   = t[i];
506f1af5d2fSBarry Smith   }
507f1af5d2fSBarry Smith 
508f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
509f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
510f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
512b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
513f1af5d2fSBarry Smith   PetscFunctionReturn(0);
514f1af5d2fSBarry Smith }
515f1af5d2fSBarry Smith 
5164a2ae208SSatish Balay #undef __FUNCT__
5174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
5187c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
519f1af5d2fSBarry Smith {
520f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
521f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
522f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
523f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
524f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
52587828ca2SBarry Smith   PetscScalar     s1,s2,x1,x2;
52687828ca2SBarry Smith   PetscScalar     *x,*b,*t;
527f1af5d2fSBarry Smith 
528f1af5d2fSBarry Smith   PetscFunctionBegin;
529f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
530f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
531f1af5d2fSBarry Smith   t  = a->solve_work;
532f1af5d2fSBarry Smith 
533f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
534f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
535f1af5d2fSBarry Smith 
536f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
537f1af5d2fSBarry Smith   ii = 0;
538f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
539f1af5d2fSBarry Smith     ic      = 2*c[i];
540f1af5d2fSBarry Smith     t[ii]   = b[ic];
541f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
542f1af5d2fSBarry Smith     ii += 2;
543f1af5d2fSBarry Smith   }
544f1af5d2fSBarry Smith 
545f1af5d2fSBarry Smith   /* forward solve the U^T */
546f1af5d2fSBarry Smith   idx = 0;
547f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
550f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
551f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
552f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
553f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
554f1af5d2fSBarry Smith     v += 4;
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
557f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
558f1af5d2fSBarry Smith     while (nz--) {
559f1af5d2fSBarry Smith       oidx = 2*(*vi++);
560f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
561f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
562f1af5d2fSBarry Smith       v  += 4;
563f1af5d2fSBarry Smith     }
564f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
565f1af5d2fSBarry Smith     idx += 2;
566f1af5d2fSBarry Smith   }
567f1af5d2fSBarry Smith   /* backward solve the L^T */
568f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
569f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
570f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
571f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
572f1af5d2fSBarry Smith     idt  = 2*i;
573f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
574f1af5d2fSBarry Smith     while (nz--) {
575f1af5d2fSBarry Smith       idx   = 2*(*vi--);
576f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
577f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
578f1af5d2fSBarry Smith       v -= 4;
579f1af5d2fSBarry Smith     }
580f1af5d2fSBarry Smith   }
581f1af5d2fSBarry Smith 
582f1af5d2fSBarry Smith   /* copy t into x according to permutation */
583f1af5d2fSBarry Smith   ii = 0;
584f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
585f1af5d2fSBarry Smith     ir      = 2*r[i];
586f1af5d2fSBarry Smith     x[ir]   = t[ii];
587f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
588f1af5d2fSBarry Smith     ii += 2;
589f1af5d2fSBarry Smith   }
590f1af5d2fSBarry Smith 
591f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
592f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
593f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
594f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
595b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
596f1af5d2fSBarry Smith   PetscFunctionReturn(0);
597f1af5d2fSBarry Smith }
598f1af5d2fSBarry Smith 
5994a2ae208SSatish Balay #undef __FUNCT__
6004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
6017c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
602f1af5d2fSBarry Smith {
603f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
604f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
605f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
606f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
607f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
60887828ca2SBarry Smith   PetscScalar     s1,s2,s3,x1,x2,x3;
60987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
610f1af5d2fSBarry Smith 
611f1af5d2fSBarry Smith   PetscFunctionBegin;
612f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
613f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
614f1af5d2fSBarry Smith   t  = a->solve_work;
615f1af5d2fSBarry Smith 
616f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
617f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
618f1af5d2fSBarry Smith 
619f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
620f1af5d2fSBarry Smith   ii = 0;
621f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
622f1af5d2fSBarry Smith     ic      = 3*c[i];
623f1af5d2fSBarry Smith     t[ii]   = b[ic];
624f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
625f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
626f1af5d2fSBarry Smith     ii += 3;
627f1af5d2fSBarry Smith   }
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   /* forward solve the U^T */
630f1af5d2fSBarry Smith   idx = 0;
631f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
632f1af5d2fSBarry Smith 
633f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
634f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
635f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
636f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
637f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
638f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
639f1af5d2fSBarry Smith     v += 9;
640f1af5d2fSBarry Smith 
641f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
642f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
643f1af5d2fSBarry Smith     while (nz--) {
644f1af5d2fSBarry Smith       oidx = 3*(*vi++);
645f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
646f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
647f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
648f1af5d2fSBarry Smith       v  += 9;
649f1af5d2fSBarry Smith     }
650f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
651f1af5d2fSBarry Smith     idx += 3;
652f1af5d2fSBarry Smith   }
653f1af5d2fSBarry Smith   /* backward solve the L^T */
654f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
655f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
656f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
657f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
658f1af5d2fSBarry Smith     idt  = 3*i;
659f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
660f1af5d2fSBarry Smith     while (nz--) {
661f1af5d2fSBarry Smith       idx   = 3*(*vi--);
662f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
663f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
664f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
665f1af5d2fSBarry Smith       v -= 9;
666f1af5d2fSBarry Smith     }
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith 
669f1af5d2fSBarry Smith   /* copy t into x according to permutation */
670f1af5d2fSBarry Smith   ii = 0;
671f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
672f1af5d2fSBarry Smith     ir      = 3*r[i];
673f1af5d2fSBarry Smith     x[ir]   = t[ii];
674f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
675f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
676f1af5d2fSBarry Smith     ii += 3;
677f1af5d2fSBarry Smith   }
678f1af5d2fSBarry Smith 
679f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
680f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
681f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
682f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
683b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
684f1af5d2fSBarry Smith   PetscFunctionReturn(0);
685f1af5d2fSBarry Smith }
686f1af5d2fSBarry Smith 
6874a2ae208SSatish Balay #undef __FUNCT__
6884a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
6897c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
690f1af5d2fSBarry Smith {
691f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
692f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
693f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
694f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
695f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
69687828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
69787828ca2SBarry Smith   PetscScalar     *x,*b,*t;
698f1af5d2fSBarry Smith 
699f1af5d2fSBarry Smith   PetscFunctionBegin;
700f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
702f1af5d2fSBarry Smith   t  = a->solve_work;
703f1af5d2fSBarry Smith 
704f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
705f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
706f1af5d2fSBarry Smith 
707f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
708f1af5d2fSBarry Smith   ii = 0;
709f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
710f1af5d2fSBarry Smith     ic      = 4*c[i];
711f1af5d2fSBarry Smith     t[ii]   = b[ic];
712f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
713f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
714f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
715f1af5d2fSBarry Smith     ii += 4;
716f1af5d2fSBarry Smith   }
717f1af5d2fSBarry Smith 
718f1af5d2fSBarry Smith   /* forward solve the U^T */
719f1af5d2fSBarry Smith   idx = 0;
720f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
723f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
724f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
725f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
726f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
727f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
728f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
729f1af5d2fSBarry Smith     v += 16;
730f1af5d2fSBarry Smith 
731f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
732f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
733f1af5d2fSBarry Smith     while (nz--) {
734f1af5d2fSBarry Smith       oidx = 4*(*vi++);
735f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
736f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
737f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
738f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
739f1af5d2fSBarry Smith       v  += 16;
740f1af5d2fSBarry Smith     }
741f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
742f1af5d2fSBarry Smith     idx += 4;
743f1af5d2fSBarry Smith   }
744f1af5d2fSBarry Smith   /* backward solve the L^T */
745f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
746f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
747f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
748f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
749f1af5d2fSBarry Smith     idt  = 4*i;
750f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       idx   = 4*(*vi--);
753f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
754f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
755f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
756f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
757f1af5d2fSBarry Smith       v -= 16;
758f1af5d2fSBarry Smith     }
759f1af5d2fSBarry Smith   }
760f1af5d2fSBarry Smith 
761f1af5d2fSBarry Smith   /* copy t into x according to permutation */
762f1af5d2fSBarry Smith   ii = 0;
763f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
764f1af5d2fSBarry Smith     ir      = 4*r[i];
765f1af5d2fSBarry Smith     x[ir]   = t[ii];
766f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
767f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
768f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
769f1af5d2fSBarry Smith     ii += 4;
770f1af5d2fSBarry Smith   }
771f1af5d2fSBarry Smith 
772f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
773f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
774f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
775f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
776b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
777f1af5d2fSBarry Smith   PetscFunctionReturn(0);
778f1af5d2fSBarry Smith }
779f1af5d2fSBarry Smith 
7804a2ae208SSatish Balay #undef __FUNCT__
7814a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
7827c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
783f1af5d2fSBarry Smith {
784f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
785f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
786f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
787f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
788f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
78987828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
79087828ca2SBarry Smith   PetscScalar     *x,*b,*t;
791f1af5d2fSBarry Smith 
792f1af5d2fSBarry Smith   PetscFunctionBegin;
793f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
795f1af5d2fSBarry Smith   t  = a->solve_work;
796f1af5d2fSBarry Smith 
797f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
798f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
799f1af5d2fSBarry Smith 
800f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
801f1af5d2fSBarry Smith   ii = 0;
802f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
803f1af5d2fSBarry Smith     ic      = 5*c[i];
804f1af5d2fSBarry Smith     t[ii]   = b[ic];
805f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
806f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
807f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
808f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
809f1af5d2fSBarry Smith     ii += 5;
810f1af5d2fSBarry Smith   }
811f1af5d2fSBarry Smith 
812f1af5d2fSBarry Smith   /* forward solve the U^T */
813f1af5d2fSBarry Smith   idx = 0;
814f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
817f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
818f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
819f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
820f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
821f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
822f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
823f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
824f1af5d2fSBarry Smith     v += 25;
825f1af5d2fSBarry Smith 
826f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
827f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
828f1af5d2fSBarry Smith     while (nz--) {
829f1af5d2fSBarry Smith       oidx = 5*(*vi++);
830f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
831f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
832f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
833f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
834f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
835f1af5d2fSBarry Smith       v  += 25;
836f1af5d2fSBarry Smith     }
837f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
838f1af5d2fSBarry Smith     idx += 5;
839f1af5d2fSBarry Smith   }
840f1af5d2fSBarry Smith   /* backward solve the L^T */
841f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
842f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
843f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
844f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
845f1af5d2fSBarry Smith     idt  = 5*i;
846f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       idx   = 5*(*vi--);
849f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v -= 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith 
858f1af5d2fSBarry Smith   /* copy t into x according to permutation */
859f1af5d2fSBarry Smith   ii = 0;
860f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
861f1af5d2fSBarry Smith     ir      = 5*r[i];
862f1af5d2fSBarry Smith     x[ir]   = t[ii];
863f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
864f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
865f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
866f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
867f1af5d2fSBarry Smith     ii += 5;
868f1af5d2fSBarry Smith   }
869f1af5d2fSBarry Smith 
870f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
871f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
872f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
873f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
874b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
875f1af5d2fSBarry Smith   PetscFunctionReturn(0);
876f1af5d2fSBarry Smith }
877f1af5d2fSBarry Smith 
8784a2ae208SSatish Balay #undef __FUNCT__
8794a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
8807c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
881f1af5d2fSBarry Smith {
882f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
883f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
884f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
885f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
886f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
88787828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
88887828ca2SBarry Smith   PetscScalar     *x,*b,*t;
889f1af5d2fSBarry Smith 
890f1af5d2fSBarry Smith   PetscFunctionBegin;
891f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
893f1af5d2fSBarry Smith   t  = a->solve_work;
894f1af5d2fSBarry Smith 
895f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
896f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
897f1af5d2fSBarry Smith 
898f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
899f1af5d2fSBarry Smith   ii = 0;
900f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
901f1af5d2fSBarry Smith     ic      = 6*c[i];
902f1af5d2fSBarry Smith     t[ii]   = b[ic];
903f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
904f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
905f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
906f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
907f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
908f1af5d2fSBarry Smith     ii += 6;
909f1af5d2fSBarry Smith   }
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   /* forward solve the U^T */
912f1af5d2fSBarry Smith   idx = 0;
913f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
914f1af5d2fSBarry Smith 
915f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
916f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
917f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
918f1af5d2fSBarry Smith     x6    = t[5+idx];
919f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
920f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
921f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
922f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
923f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
924f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
925f1af5d2fSBarry Smith     v += 36;
926f1af5d2fSBarry Smith 
927f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
928f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
929f1af5d2fSBarry Smith     while (nz--) {
930f1af5d2fSBarry Smith       oidx = 6*(*vi++);
931f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
932f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
933f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
934f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
935f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
936f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
937f1af5d2fSBarry Smith       v  += 36;
938f1af5d2fSBarry Smith     }
939f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
940f1af5d2fSBarry Smith     t[5+idx] = s6;
941f1af5d2fSBarry Smith     idx += 6;
942f1af5d2fSBarry Smith   }
943f1af5d2fSBarry Smith   /* backward solve the L^T */
944f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
945f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
946f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
947f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
948f1af5d2fSBarry Smith     idt  = 6*i;
949f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
950f1af5d2fSBarry Smith     s6 = t[5+idt];
951f1af5d2fSBarry Smith     while (nz--) {
952f1af5d2fSBarry Smith       idx   = 6*(*vi--);
953f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
954f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
955f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
956f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
957f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
958f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
959f1af5d2fSBarry Smith       v -= 36;
960f1af5d2fSBarry Smith     }
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith 
963f1af5d2fSBarry Smith   /* copy t into x according to permutation */
964f1af5d2fSBarry Smith   ii = 0;
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith     ir      = 6*r[i];
967f1af5d2fSBarry Smith     x[ir]   = t[ii];
968f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
969f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
970f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
971f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
972f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
973f1af5d2fSBarry Smith     ii += 6;
974f1af5d2fSBarry Smith   }
975f1af5d2fSBarry Smith 
976f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
977f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
978f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
979f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
980b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
981f1af5d2fSBarry Smith   PetscFunctionReturn(0);
982f1af5d2fSBarry Smith }
983f1af5d2fSBarry Smith 
9844a2ae208SSatish Balay #undef __FUNCT__
9854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
9867c922b88SBarry Smith int MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
987f1af5d2fSBarry Smith {
988f1af5d2fSBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
989f1af5d2fSBarry Smith   IS              iscol=a->col,isrow=a->row;
990f1af5d2fSBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,*rout,*cout;
991f1af5d2fSBarry Smith   int             *diag = a->diag,ii,ic,ir,oidx;
992f1af5d2fSBarry Smith   MatScalar       *aa=a->a,*v;
99387828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
99487828ca2SBarry Smith   PetscScalar     *x,*b,*t;
995f1af5d2fSBarry Smith 
996f1af5d2fSBarry Smith   PetscFunctionBegin;
997f1af5d2fSBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
999f1af5d2fSBarry Smith   t  = a->solve_work;
1000f1af5d2fSBarry Smith 
1001f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1002f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1003f1af5d2fSBarry Smith 
1004f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1005f1af5d2fSBarry Smith   ii = 0;
1006f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1007f1af5d2fSBarry Smith     ic      = 7*c[i];
1008f1af5d2fSBarry Smith     t[ii]   = b[ic];
1009f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1010f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1011f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1012f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1013f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1014f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1015f1af5d2fSBarry Smith     ii += 7;
1016f1af5d2fSBarry Smith   }
1017f1af5d2fSBarry Smith 
1018f1af5d2fSBarry Smith   /* forward solve the U^T */
1019f1af5d2fSBarry Smith   idx = 0;
1020f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1023f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1024f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1025f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1026f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1027f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1028f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1029f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1030f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1031f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1032f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1033f1af5d2fSBarry Smith     v += 49;
1034f1af5d2fSBarry Smith 
1035f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1036f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1037f1af5d2fSBarry Smith     while (nz--) {
1038f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1039f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1040f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1041f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1042f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1043f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1044f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1045f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1046f1af5d2fSBarry Smith       v  += 49;
1047f1af5d2fSBarry Smith     }
1048f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1049f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1050f1af5d2fSBarry Smith     idx += 7;
1051f1af5d2fSBarry Smith   }
1052f1af5d2fSBarry Smith   /* backward solve the L^T */
1053f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1054f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1055f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1056f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1057f1af5d2fSBarry Smith     idt  = 7*i;
1058f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1059f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1062f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v -= 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith   }
1072f1af5d2fSBarry Smith 
1073f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1074f1af5d2fSBarry Smith   ii = 0;
1075f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1076f1af5d2fSBarry Smith     ir      = 7*r[i];
1077f1af5d2fSBarry Smith     x[ir]   = t[ii];
1078f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1079f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1080f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1081f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1082f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1083f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1084f1af5d2fSBarry Smith     ii += 7;
1085f1af5d2fSBarry Smith   }
1086f1af5d2fSBarry Smith 
1087f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1088f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1089f1af5d2fSBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1090f1af5d2fSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1091b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
1092f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1093f1af5d2fSBarry Smith }
1094f1af5d2fSBarry Smith 
10954e2b4712SSatish Balay /* ----------------------------------------------------------- */
10964a2ae208SSatish Balay #undef __FUNCT__
10974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
10984e2b4712SSatish Balay int MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
10994e2b4712SSatish Balay {
11004e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11014e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11024e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
11034e2b4712SSatish Balay   int             nz,bs=a->bs,bs2=a->bs2,*rout,*cout;
11043f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
110587828ca2SBarry Smith   PetscScalar     *x,*b,*s,*t,*ls;
11064e2b4712SSatish Balay 
11074e2b4712SSatish Balay   PetscFunctionBegin;
1108e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1109e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1110f1af5d2fSBarry Smith   t  = a->solve_work;
11114e2b4712SSatish Balay 
11124e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11134e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11144e2b4712SSatish Balay 
11154e2b4712SSatish Balay   /* forward solve the lower triangular */
111687828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11174e2b4712SSatish Balay   for (i=1; i<n; i++) {
11184e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11194e2b4712SSatish Balay     vi  = aj + ai[i];
11204e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1121f1af5d2fSBarry Smith     s = t + bs*i;
112287828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11234e2b4712SSatish Balay     while (nz--) {
1124f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11254e2b4712SSatish Balay       v += bs2;
11264e2b4712SSatish Balay     }
11274e2b4712SSatish Balay   }
11284e2b4712SSatish Balay   /* backward solve the upper triangular */
1129273d9f13SBarry Smith   ls = a->solve_work + A->n;
11304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11314e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11324e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11334e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
113487828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11354e2b4712SSatish Balay     while (nz--) {
1136f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11374e2b4712SSatish Balay       v += bs2;
11384e2b4712SSatish Balay     }
1139f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
114087828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11414e2b4712SSatish Balay   }
11424e2b4712SSatish Balay 
11434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1145e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1146e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1147b0a32e0cSBarry Smith   PetscLogFlops(2*(a->bs2)*(a->nz) - a->bs*A->n);
11484e2b4712SSatish Balay   PetscFunctionReturn(0);
11494e2b4712SSatish Balay }
11504e2b4712SSatish Balay 
11514a2ae208SSatish Balay #undef __FUNCT__
11524a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
11534e2b4712SSatish Balay int MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11544e2b4712SSatish Balay {
11554e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
11564e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
11574e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
11584e2b4712SSatish Balay   int             *diag = a->diag;
11593f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
116087828ca2SBarry Smith   PetscScalar     s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
116187828ca2SBarry Smith   PetscScalar     *x,*b,*t;
11624e2b4712SSatish Balay 
11634e2b4712SSatish Balay   PetscFunctionBegin;
1164e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1165e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1166f1af5d2fSBarry Smith   t  = a->solve_work;
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11704e2b4712SSatish Balay 
11714e2b4712SSatish Balay   /* forward solve the lower triangular */
11724e2b4712SSatish Balay   idx    = 7*(*r++);
1173f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1174f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1175f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11764e2b4712SSatish Balay 
11774e2b4712SSatish Balay   for (i=1; i<n; i++) {
11784e2b4712SSatish Balay     v     = aa + 49*ai[i];
11794e2b4712SSatish Balay     vi    = aj + ai[i];
11804e2b4712SSatish Balay     nz    = diag[i] - ai[i];
11814e2b4712SSatish Balay     idx   = 7*(*r++);
1182f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1183f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
11844e2b4712SSatish Balay     while (nz--) {
11854e2b4712SSatish Balay       idx   = 7*(*vi++);
1186f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1187f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1188f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1189f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1190f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1191f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1192f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1193f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1194f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1195f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
11964e2b4712SSatish Balay       v += 49;
11974e2b4712SSatish Balay     }
11984e2b4712SSatish Balay     idx = 7*i;
1199f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1200f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1201f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12024e2b4712SSatish Balay   }
12034e2b4712SSatish Balay   /* backward solve the upper triangular */
12044e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12054e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12064e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12074e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12084e2b4712SSatish Balay     idt  = 7*i;
1209f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1210f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1211f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12124e2b4712SSatish Balay     while (nz--) {
12134e2b4712SSatish Balay       idx   = 7*(*vi++);
1214f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1215f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1216f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1217f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1218f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1219f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1220f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1221f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1222f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1223f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12244e2b4712SSatish Balay       v += 49;
12254e2b4712SSatish Balay     }
12264e2b4712SSatish Balay     idc = 7*(*c--);
12274e2b4712SSatish Balay     v   = aa + 49*diag[i];
1228f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1229f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1230f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1231f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1232f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1233f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1234f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1235f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1236f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1237f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1238f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1239f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1240f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1241f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12424e2b4712SSatish Balay   }
12434e2b4712SSatish Balay 
12444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1246e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1247e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1248b0a32e0cSBarry Smith   PetscLogFlops(2*49*(a->nz) - 7*A->n);
12494e2b4712SSatish Balay   PetscFunctionReturn(0);
12504e2b4712SSatish Balay }
12514e2b4712SSatish Balay 
12524a2ae208SSatish Balay #undef __FUNCT__
12534a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
125415091d37SBarry Smith int MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
125515091d37SBarry Smith {
125615091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
125715091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
125815091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
125915091d37SBarry Smith   MatScalar       *aa=a->a,*v;
126087828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
126115091d37SBarry Smith 
126215091d37SBarry Smith   PetscFunctionBegin;
126315091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
126415091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
126515091d37SBarry Smith   /* forward solve the lower triangular */
126615091d37SBarry Smith   idx    = 0;
126715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
126815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
126915091d37SBarry Smith   x[6] = b[6+idx];
127015091d37SBarry Smith   for (i=1; i<n; i++) {
127115091d37SBarry Smith     v     =  aa + 49*ai[i];
127215091d37SBarry Smith     vi    =  aj + ai[i];
127315091d37SBarry Smith     nz    =  diag[i] - ai[i];
127415091d37SBarry Smith     idx   =  7*i;
1275f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1276f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1277f1af5d2fSBarry Smith     s7  =  b[6+idx];
127815091d37SBarry Smith     while (nz--) {
127915091d37SBarry Smith       jdx   = 7*(*vi++);
128015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
128115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
128215091d37SBarry Smith       x7    = x[6+jdx];
1283f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1284f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1285f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1286f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1287f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1288f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1289f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
129015091d37SBarry Smith       v += 49;
129115091d37SBarry Smith      }
1292f1af5d2fSBarry Smith     x[idx]   = s1;
1293f1af5d2fSBarry Smith     x[1+idx] = s2;
1294f1af5d2fSBarry Smith     x[2+idx] = s3;
1295f1af5d2fSBarry Smith     x[3+idx] = s4;
1296f1af5d2fSBarry Smith     x[4+idx] = s5;
1297f1af5d2fSBarry Smith     x[5+idx] = s6;
1298f1af5d2fSBarry Smith     x[6+idx] = s7;
129915091d37SBarry Smith   }
130015091d37SBarry Smith   /* backward solve the upper triangular */
130115091d37SBarry Smith   for (i=n-1; i>=0; i--){
130215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
130315091d37SBarry Smith     vi   = aj + diag[i] + 1;
130415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
130515091d37SBarry Smith     idt  = 7*i;
1306f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1307f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1308f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1309f1af5d2fSBarry Smith     s7 = x[6+idt];
131015091d37SBarry Smith     while (nz--) {
131115091d37SBarry Smith       idx   = 7*(*vi++);
131215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
131315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
131415091d37SBarry Smith       x7    = x[6+idx];
1315f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1316f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1317f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1318f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1319f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1320f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1321f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
132215091d37SBarry Smith       v += 49;
132315091d37SBarry Smith     }
132415091d37SBarry Smith     v        = aa + 49*diag[i];
1325f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1326f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1327f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1328f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1329f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1330f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1331f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1332f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1333f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1334f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1335f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1336f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1337f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1338f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
133915091d37SBarry Smith   }
134015091d37SBarry Smith 
134115091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
134215091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1343b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
134415091d37SBarry Smith   PetscFunctionReturn(0);
134515091d37SBarry Smith }
134615091d37SBarry Smith 
13474a2ae208SSatish Balay #undef __FUNCT__
13484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
134915091d37SBarry Smith int MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
135015091d37SBarry Smith {
135115091d37SBarry Smith   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
135215091d37SBarry Smith   IS              iscol=a->col,isrow=a->row;
135315091d37SBarry Smith   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
135415091d37SBarry Smith   int             *diag = a->diag;
135515091d37SBarry Smith   MatScalar       *aa=a->a,*v;
135687828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
135715091d37SBarry Smith 
135815091d37SBarry Smith   PetscFunctionBegin;
135915091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
136015091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1361f1af5d2fSBarry Smith   t  = a->solve_work;
136215091d37SBarry Smith 
136315091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
136415091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
136515091d37SBarry Smith 
136615091d37SBarry Smith   /* forward solve the lower triangular */
136715091d37SBarry Smith   idx    = 6*(*r++);
1368f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1369f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1370f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
137115091d37SBarry Smith   for (i=1; i<n; i++) {
137215091d37SBarry Smith     v     = aa + 36*ai[i];
137315091d37SBarry Smith     vi    = aj + ai[i];
137415091d37SBarry Smith     nz    = diag[i] - ai[i];
137515091d37SBarry Smith     idx   = 6*(*r++);
1376f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1377f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
137815091d37SBarry Smith     while (nz--) {
137915091d37SBarry Smith       idx   = 6*(*vi++);
1380f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1381f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1382f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1383f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1384f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1385f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1386f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1387f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
138815091d37SBarry Smith       v += 36;
138915091d37SBarry Smith     }
139015091d37SBarry Smith     idx = 6*i;
1391f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1392f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1393f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
139415091d37SBarry Smith   }
139515091d37SBarry Smith   /* backward solve the upper triangular */
139615091d37SBarry Smith   for (i=n-1; i>=0; i--){
139715091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
139815091d37SBarry Smith     vi   = aj + diag[i] + 1;
139915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
140015091d37SBarry Smith     idt  = 6*i;
1401f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1402f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1403f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
140415091d37SBarry Smith     while (nz--) {
140515091d37SBarry Smith       idx   = 6*(*vi++);
1406f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1407f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1408f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1409f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141515091d37SBarry Smith       v += 36;
141615091d37SBarry Smith     }
141715091d37SBarry Smith     idc = 6*(*c--);
141815091d37SBarry Smith     v   = aa + 36*diag[i];
1419f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1420f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1421f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1422f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1423f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1424f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1425f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1426f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1427f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1428f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1429f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1430f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
143115091d37SBarry Smith   }
143215091d37SBarry Smith 
143315091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
143415091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
143515091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
143615091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1437b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
143815091d37SBarry Smith   PetscFunctionReturn(0);
143915091d37SBarry Smith }
144015091d37SBarry Smith 
14414a2ae208SSatish Balay #undef __FUNCT__
14424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
144315091d37SBarry Smith int MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
144415091d37SBarry Smith {
144515091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
144615091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
144715091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
144815091d37SBarry Smith   MatScalar       *aa=a->a,*v;
144987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
145015091d37SBarry Smith 
145115091d37SBarry Smith   PetscFunctionBegin;
145215091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
145315091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
145415091d37SBarry Smith   /* forward solve the lower triangular */
145515091d37SBarry Smith   idx    = 0;
145615091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
145715091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
145815091d37SBarry Smith   for (i=1; i<n; i++) {
145915091d37SBarry Smith     v     =  aa + 36*ai[i];
146015091d37SBarry Smith     vi    =  aj + ai[i];
146115091d37SBarry Smith     nz    =  diag[i] - ai[i];
146215091d37SBarry Smith     idx   =  6*i;
1463f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1464f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
146515091d37SBarry Smith     while (nz--) {
146615091d37SBarry Smith       jdx   = 6*(*vi++);
146715091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
146815091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1469f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1470f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1471f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1472f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1473f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1474f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
147515091d37SBarry Smith       v += 36;
147615091d37SBarry Smith      }
1477f1af5d2fSBarry Smith     x[idx]   = s1;
1478f1af5d2fSBarry Smith     x[1+idx] = s2;
1479f1af5d2fSBarry Smith     x[2+idx] = s3;
1480f1af5d2fSBarry Smith     x[3+idx] = s4;
1481f1af5d2fSBarry Smith     x[4+idx] = s5;
1482f1af5d2fSBarry Smith     x[5+idx] = s6;
148315091d37SBarry Smith   }
148415091d37SBarry Smith   /* backward solve the upper triangular */
148515091d37SBarry Smith   for (i=n-1; i>=0; i--){
148615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148715091d37SBarry Smith     vi   = aj + diag[i] + 1;
148815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148915091d37SBarry Smith     idt  = 6*i;
1490f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1491f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1492f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
149315091d37SBarry Smith     while (nz--) {
149415091d37SBarry Smith       idx   = 6*(*vi++);
149515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
149615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150315091d37SBarry Smith       v += 36;
150415091d37SBarry Smith     }
150515091d37SBarry Smith     v        = aa + 36*diag[i];
1506f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1507f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1508f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1509f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1510f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1511f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
151215091d37SBarry Smith   }
151315091d37SBarry Smith 
151415091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
151515091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1516b0a32e0cSBarry Smith   PetscLogFlops(2*36*(a->nz) - 6*A->n);
151715091d37SBarry Smith   PetscFunctionReturn(0);
151815091d37SBarry Smith }
151915091d37SBarry Smith 
15204a2ae208SSatish Balay #undef __FUNCT__
15214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
15224e2b4712SSatish Balay int MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15234e2b4712SSatish Balay {
15244e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
15254e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
15264e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
15274e2b4712SSatish Balay   int             *diag = a->diag;
15283f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
152987828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
15304e2b4712SSatish Balay 
15314e2b4712SSatish Balay   PetscFunctionBegin;
1532e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1533e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1534f1af5d2fSBarry Smith   t  = a->solve_work;
15354e2b4712SSatish Balay 
15364e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15374e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15384e2b4712SSatish Balay 
15394e2b4712SSatish Balay   /* forward solve the lower triangular */
15404e2b4712SSatish Balay   idx    = 5*(*r++);
1541f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1542f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15434e2b4712SSatish Balay   for (i=1; i<n; i++) {
15444e2b4712SSatish Balay     v     = aa + 25*ai[i];
15454e2b4712SSatish Balay     vi    = aj + ai[i];
15464e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15474e2b4712SSatish Balay     idx   = 5*(*r++);
1548f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1549f1af5d2fSBarry Smith     s5  = b[4+idx];
15504e2b4712SSatish Balay     while (nz--) {
15514e2b4712SSatish Balay       idx   = 5*(*vi++);
1552f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1553f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1554f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1555f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1556f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1557f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1558f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15594e2b4712SSatish Balay       v += 25;
15604e2b4712SSatish Balay     }
15614e2b4712SSatish Balay     idx = 5*i;
1562f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1563f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15644e2b4712SSatish Balay   }
15654e2b4712SSatish Balay   /* backward solve the upper triangular */
15664e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15674e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15684e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15694e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
15704e2b4712SSatish Balay     idt  = 5*i;
1571f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1572f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
15734e2b4712SSatish Balay     while (nz--) {
15744e2b4712SSatish Balay       idx   = 5*(*vi++);
1575f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1576f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1577f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1578f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1579f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1580f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1581f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15824e2b4712SSatish Balay       v += 25;
15834e2b4712SSatish Balay     }
15844e2b4712SSatish Balay     idc = 5*(*c--);
15854e2b4712SSatish Balay     v   = aa + 25*diag[i];
1586f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1587f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1588f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1589f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1590f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1591f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1592f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1593f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1594f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1595f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
15964e2b4712SSatish Balay   }
15974e2b4712SSatish Balay 
15984e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15994e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1600e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1601e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1602b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
16034e2b4712SSatish Balay   PetscFunctionReturn(0);
16044e2b4712SSatish Balay }
16054e2b4712SSatish Balay 
16064a2ae208SSatish Balay #undef __FUNCT__
16074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
160815091d37SBarry Smith int MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
160915091d37SBarry Smith {
161015091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
161115091d37SBarry Smith   int             i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
161215091d37SBarry Smith   int             ierr,*diag = a->diag,jdx;
161315091d37SBarry Smith   MatScalar       *aa=a->a,*v;
161487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
161515091d37SBarry Smith 
161615091d37SBarry Smith   PetscFunctionBegin;
161715091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
161815091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
161915091d37SBarry Smith   /* forward solve the lower triangular */
162015091d37SBarry Smith   idx    = 0;
162115091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
162215091d37SBarry Smith   for (i=1; i<n; i++) {
162315091d37SBarry Smith     v     =  aa + 25*ai[i];
162415091d37SBarry Smith     vi    =  aj + ai[i];
162515091d37SBarry Smith     nz    =  diag[i] - ai[i];
162615091d37SBarry Smith     idx   =  5*i;
1627f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
162815091d37SBarry Smith     while (nz--) {
162915091d37SBarry Smith       jdx   = 5*(*vi++);
163015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1631f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1632f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1633f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1634f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1635f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
163615091d37SBarry Smith       v    += 25;
163715091d37SBarry Smith     }
1638f1af5d2fSBarry Smith     x[idx]   = s1;
1639f1af5d2fSBarry Smith     x[1+idx] = s2;
1640f1af5d2fSBarry Smith     x[2+idx] = s3;
1641f1af5d2fSBarry Smith     x[3+idx] = s4;
1642f1af5d2fSBarry Smith     x[4+idx] = s5;
164315091d37SBarry Smith   }
164415091d37SBarry Smith   /* backward solve the upper triangular */
164515091d37SBarry Smith   for (i=n-1; i>=0; i--){
164615091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
164715091d37SBarry Smith     vi   = aj + diag[i] + 1;
164815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
164915091d37SBarry Smith     idt  = 5*i;
1650f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1651f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
165215091d37SBarry Smith     while (nz--) {
165315091d37SBarry Smith       idx   = 5*(*vi++);
165415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1655f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1656f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1657f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1658f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1659f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166015091d37SBarry Smith       v    += 25;
166115091d37SBarry Smith     }
166215091d37SBarry Smith     v        = aa + 25*diag[i];
1663f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1664f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1665f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1666f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1667f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
166815091d37SBarry Smith   }
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
167115091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1672b0a32e0cSBarry Smith   PetscLogFlops(2*25*(a->nz) - 5*A->n);
167315091d37SBarry Smith   PetscFunctionReturn(0);
167415091d37SBarry Smith }
167515091d37SBarry Smith 
16764a2ae208SSatish Balay #undef __FUNCT__
16774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
16784e2b4712SSatish Balay int MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
16794e2b4712SSatish Balay {
16804e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
16814e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
16824e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
16834e2b4712SSatish Balay   int             *diag = a->diag;
16843f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
168587828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,s4,x1,x2,x3,x4,*t;
16864e2b4712SSatish Balay 
16874e2b4712SSatish Balay   PetscFunctionBegin;
1688e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1689e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1690f1af5d2fSBarry Smith   t  = a->solve_work;
16914e2b4712SSatish Balay 
16924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16944e2b4712SSatish Balay 
16954e2b4712SSatish Balay   /* forward solve the lower triangular */
16964e2b4712SSatish Balay   idx    = 4*(*r++);
1697f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1698f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
16994e2b4712SSatish Balay   for (i=1; i<n; i++) {
17004e2b4712SSatish Balay     v     = aa + 16*ai[i];
17014e2b4712SSatish Balay     vi    = aj + ai[i];
17024e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17034e2b4712SSatish Balay     idx   = 4*(*r++);
1704f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17054e2b4712SSatish Balay     while (nz--) {
17064e2b4712SSatish Balay       idx   = 4*(*vi++);
1707f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1708f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1709f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1710f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1711f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17124e2b4712SSatish Balay       v    += 16;
17134e2b4712SSatish Balay     }
17144e2b4712SSatish Balay     idx        = 4*i;
1715f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1716f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17174e2b4712SSatish Balay   }
17184e2b4712SSatish Balay   /* backward solve the upper triangular */
17194e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17204e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17214e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17224e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17234e2b4712SSatish Balay     idt  = 4*i;
1724f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1725f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17264e2b4712SSatish Balay     while (nz--) {
17274e2b4712SSatish Balay       idx   = 4*(*vi++);
1728f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1729f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1730f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1731f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1732f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1733f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17344e2b4712SSatish Balay       v += 16;
17354e2b4712SSatish Balay     }
17364e2b4712SSatish Balay     idc      = 4*(*c--);
17374e2b4712SSatish Balay     v        = aa + 16*diag[i];
1738f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1739f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1740f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1741f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17424e2b4712SSatish Balay   }
17434e2b4712SSatish Balay 
17444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1746e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1747e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1748b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
17494e2b4712SSatish Balay   PetscFunctionReturn(0);
17504e2b4712SSatish Balay }
175124c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
175224c233c2SKris Buschelman 
175324c233c2SKris Buschelman #include PETSC_HAVE_SSE
175424c233c2SKris Buschelman 
175524c233c2SKris Buschelman #undef __FUNCT__
175624c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
175724c233c2SKris Buschelman int MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
175824c233c2SKris Buschelman {
175924c233c2SKris Buschelman   /*
176024c233c2SKris Buschelman      Note: This code uses demotion of double
176124c233c2SKris Buschelman      to float when performing the mixed-mode computation.
176224c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
176324c233c2SKris Buschelman   */
176424c233c2SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
176524c233c2SKris Buschelman   IS              iscol=a->col,isrow=a->row;
176624c233c2SKris Buschelman   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
176724c233c2SKris Buschelman   int             *diag = a->diag,ai16;
176824c233c2SKris Buschelman   MatScalar       *aa=a->a,*v;
176987828ca2SBarry Smith   PetscScalar     *x,*b,*t;
177024c233c2SKris Buschelman 
177124c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
177224c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
177324c233c2SKris Buschelman   unsigned long   offset;
177424c233c2SKris Buschelman 
177524c233c2SKris Buschelman   PetscFunctionBegin;
177624c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
177724c233c2SKris Buschelman 
177824c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
177924c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
178024c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
178124c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
178224c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
178324c233c2SKris Buschelman 
178424c233c2SKris Buschelman     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
178524c233c2SKris Buschelman     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
178624c233c2SKris Buschelman     t  = a->solve_work;
178724c233c2SKris Buschelman 
178824c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
178924c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
179024c233c2SKris Buschelman 
179124c233c2SKris Buschelman     /* forward solve the lower triangular */
179224c233c2SKris Buschelman     idx  = 4*(*r++);
179324c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
179424c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
179524c233c2SKris Buschelman     v    =  aa + 16*ai[1];
179624c233c2SKris Buschelman 
179724c233c2SKris Buschelman     for (i=1; i<n;) {
179824c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
179924c233c2SKris Buschelman       vi   =  aj      + ai[i];
180024c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
180124c233c2SKris Buschelman       idx  =  4*(*r++);
180224c233c2SKris Buschelman 
180324c233c2SKris Buschelman       /* Demote sum from double to float */
180424c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
180524c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
180624c233c2SKris Buschelman 
180724c233c2SKris Buschelman       while (nz--) {
180824c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
180924c233c2SKris Buschelman         idx = 4*(*vi++);
181024c233c2SKris Buschelman 
181124c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
181224c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
181324c233c2SKris Buschelman 
181424c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
181524c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
181624c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
181724c233c2SKris Buschelman 
181824c233c2SKris Buschelman           /* First Column */
181924c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
182024c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
182124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
182224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
182324c233c2SKris Buschelman 
182424c233c2SKris Buschelman           /* Second Column */
182524c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
182624c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
182724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
182824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
182924c233c2SKris Buschelman 
183024c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
183124c233c2SKris Buschelman 
183224c233c2SKris Buschelman           /* Third Column */
183324c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
183424c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
183524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
183624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
183724c233c2SKris Buschelman 
183824c233c2SKris Buschelman           /* Fourth Column */
183924c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
184024c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
184124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
184224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
184324c233c2SKris Buschelman         SSE_INLINE_END_2
184424c233c2SKris Buschelman 
184524c233c2SKris Buschelman         v  += 16;
184624c233c2SKris Buschelman       }
184724c233c2SKris Buschelman       idx = 4*i;
184824c233c2SKris Buschelman       v   = aa + 16*ai[++i];
184924c233c2SKris Buschelman       PREFETCH_NTA(v);
185024c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
185124c233c2SKris Buschelman 
185224c233c2SKris Buschelman       /* Promote result from float to double */
185324c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
185424c233c2SKris Buschelman     }
185524c233c2SKris Buschelman     /* backward solve the upper triangular */
185624c233c2SKris Buschelman     idt  = 4*(n-1);
185724c233c2SKris Buschelman     ai16 = 16*diag[n-1];
185824c233c2SKris Buschelman     v    = aa + ai16 + 16;
185924c233c2SKris Buschelman     for (i=n-1; i>=0;){
186024c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
186124c233c2SKris Buschelman       vi = aj + diag[i] + 1;
186224c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
186324c233c2SKris Buschelman 
186424c233c2SKris Buschelman       /* Demote accumulator from double to float */
186524c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
186624c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
186724c233c2SKris Buschelman 
186824c233c2SKris Buschelman       while (nz--) {
186924c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
187024c233c2SKris Buschelman         idx = 4*(*vi++);
187124c233c2SKris Buschelman 
187224c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
187324c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
187424c233c2SKris Buschelman 
187524c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
187624c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
187724c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
187824c233c2SKris Buschelman 
187924c233c2SKris Buschelman           /* First Column */
188024c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
188124c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
188224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
188324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
188424c233c2SKris Buschelman 
188524c233c2SKris Buschelman           /* Second Column */
188624c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
188724c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
188824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
188924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
189024c233c2SKris Buschelman 
189124c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
189224c233c2SKris Buschelman 
189324c233c2SKris Buschelman           /* Third Column */
189424c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
189524c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
189624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
189724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
189824c233c2SKris Buschelman 
189924c233c2SKris Buschelman           /* Fourth Column */
190024c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
190124c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
190224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
190324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
190424c233c2SKris Buschelman         SSE_INLINE_END_2
190524c233c2SKris Buschelman         v  += 16;
190624c233c2SKris Buschelman       }
190724c233c2SKris Buschelman       v    = aa + ai16;
190824c233c2SKris Buschelman       ai16 = 16*diag[--i];
190924c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
191024c233c2SKris Buschelman       /*
191124c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
191224c233c2SKris Buschelman          which was inverted as part of the factorization
191324c233c2SKris Buschelman       */
191424c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
191524c233c2SKris Buschelman         /* First Column */
191624c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
191724c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
191824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
191924c233c2SKris Buschelman 
192024c233c2SKris Buschelman         /* Second Column */
192124c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
192224c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
192324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
192424c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
192524c233c2SKris Buschelman 
192624c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
192724c233c2SKris Buschelman 
192824c233c2SKris Buschelman         /* Third Column */
192924c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
193024c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
193124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
193224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
193324c233c2SKris Buschelman 
193424c233c2SKris Buschelman         /* Fourth Column */
193524c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
193624c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
193724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
193824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
193924c233c2SKris Buschelman 
194024c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
194124c233c2SKris Buschelman       SSE_INLINE_END_3
194224c233c2SKris Buschelman 
194324c233c2SKris Buschelman       /* Promote solution from float to double */
194424c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
194524c233c2SKris Buschelman 
194624c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
194724c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
194824c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
194924c233c2SKris Buschelman       idc  = 4*(*c--);
195024c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
195124c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
195224c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
195324c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
195424c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
195524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
195624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
195724c233c2SKris Buschelman       SSE_INLINE_END_2
195824c233c2SKris Buschelman       v    = aa + ai16 + 16;
195924c233c2SKris Buschelman       idt -= 4;
196024c233c2SKris Buschelman     }
196124c233c2SKris Buschelman 
196224c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
196324c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
196424c233c2SKris Buschelman     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
196524c233c2SKris Buschelman     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
196624c233c2SKris Buschelman     PetscLogFlops(2*16*(a->nz) - 4*A->n);
196724c233c2SKris Buschelman   SSE_SCOPE_END;
196824c233c2SKris Buschelman   PetscFunctionReturn(0);
196924c233c2SKris Buschelman }
197024c233c2SKris Buschelman 
197124c233c2SKris Buschelman #endif
19720ef38995SBarry Smith 
19730ef38995SBarry Smith 
19744e2b4712SSatish Balay /*
19754e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
19764e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
19774e2b4712SSatish Balay */
19784a2ae208SSatish Balay #undef __FUNCT__
19794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
19804e2b4712SSatish Balay int MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
19814e2b4712SSatish Balay {
19824e2b4712SSatish Balay   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
198330d4dcafSBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
198430d4dcafSBarry Smith   int             ierr,*diag = a->diag;
19853f1db9ecSBarry Smith   MatScalar       *aa=a->a;
198687828ca2SBarry Smith   PetscScalar     *x,*b;
19874e2b4712SSatish Balay 
19884e2b4712SSatish Balay   PetscFunctionBegin;
1989e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1990e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
19914e2b4712SSatish Balay 
1992aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
19932853dc0eSBarry Smith   {
199487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
19952853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
19962853dc0eSBarry Smith   }
1997aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
19982853dc0eSBarry Smith   {
199987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
20002853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
20012853dc0eSBarry Smith   }
2002aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
20032853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2004e1293385SBarry Smith #else
200530d4dcafSBarry Smith   {
200687828ca2SBarry Smith     PetscScalar  s1,s2,s3,s4,x1,x2,x3,x4;
20073f1db9ecSBarry Smith     MatScalar    *v;
20084e555682SBarry Smith     int          jdx,idt,idx,nz,*vi,i,ai16;
2009e1293385SBarry Smith 
20104e2b4712SSatish Balay   /* forward solve the lower triangular */
20114e2b4712SSatish Balay   idx    = 0;
2012e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
20134e2b4712SSatish Balay   for (i=1; i<n; i++) {
20144e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
20154e2b4712SSatish Balay     vi    =  aj      + ai[i];
20164e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2017e1293385SBarry Smith     idx   +=  4;
2018f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
20194e2b4712SSatish Balay     while (nz--) {
20204e2b4712SSatish Balay       jdx   = 4*(*vi++);
20214e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2022f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2023f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2024f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2025f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
20264e2b4712SSatish Balay       v    += 16;
20274e2b4712SSatish Balay     }
2028f1af5d2fSBarry Smith     x[idx]   = s1;
2029f1af5d2fSBarry Smith     x[1+idx] = s2;
2030f1af5d2fSBarry Smith     x[2+idx] = s3;
2031f1af5d2fSBarry Smith     x[3+idx] = s4;
20324e2b4712SSatish Balay   }
20334e2b4712SSatish Balay   /* backward solve the upper triangular */
20344e555682SBarry Smith   idt = 4*(n-1);
20354e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20364e555682SBarry Smith     ai16 = 16*diag[i];
20374e555682SBarry Smith     v    = aa + ai16 + 16;
20384e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
20394e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2040f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2041f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
20424e2b4712SSatish Balay     while (nz--) {
20434e2b4712SSatish Balay       idx   = 4*(*vi++);
20444e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2045f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2046f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2047f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2048f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
20494e2b4712SSatish Balay       v    += 16;
20504e2b4712SSatish Balay     }
20514e555682SBarry Smith     v        = aa + ai16;
2052f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2053f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2054f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2055f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2056329f5518SBarry Smith     idt -= 4;
20574e2b4712SSatish Balay   }
205830d4dcafSBarry Smith   }
2059e1293385SBarry Smith #endif
20604e2b4712SSatish Balay 
2061e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2062e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2063b0a32e0cSBarry Smith   PetscLogFlops(2*16*(a->nz) - 4*A->n);
20644e2b4712SSatish Balay   PetscFunctionReturn(0);
20654e2b4712SSatish Balay }
20664e2b4712SSatish Balay 
20673660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
20683660e330SKris Buschelman 
20693660e330SKris Buschelman #include PETSC_HAVE_SSE
20703660e330SKris Buschelman 
20713660e330SKris Buschelman #undef __FUNCT__
20723660e330SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
20733660e330SKris Buschelman int MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
20743660e330SKris Buschelman {
20753660e330SKris Buschelman   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
20763660e330SKris Buschelman   int             n=a->mbs,*ai=a->i,*aj=a->j;
20773660e330SKris Buschelman   int             ierr,*diag = a->diag;
20783660e330SKris Buschelman   MatScalar       *aa=a->a;
207987828ca2SBarry Smith   PetscScalar     *x,*b;
20803660e330SKris Buschelman 
20813660e330SKris Buschelman   PetscFunctionBegin;
20823660e330SKris Buschelman   SSE_SCOPE_BEGIN;
20833660e330SKris Buschelman   /*
20843660e330SKris Buschelman      Note: This code currently uses demotion of double
20853660e330SKris Buschelman      to float when performing the mixed-mode computation.
20863660e330SKris Buschelman      This may not be numerically reasonable for all applications.
20873660e330SKris Buschelman   */
20883660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
20893660e330SKris Buschelman 
20903660e330SKris Buschelman   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
20913660e330SKris Buschelman   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20923660e330SKris Buschelman   {
20933660e330SKris Buschelman     MatScalar     *v;
20943660e330SKris Buschelman     int           jdx,idt,idx,nz,*vi,i,ai16;
20953660e330SKris Buschelman 
20963660e330SKris Buschelman     /* Make space in temp stack for 16 Byte Aligned arrays */
20973660e330SKris Buschelman     float         ssealignedspace[11],*tmps,*tmpx;
20983660e330SKris Buschelman     unsigned long offset = (unsigned long)ssealignedspace % 16;
20993660e330SKris Buschelman     if (offset) offset = (16 - offset)/4;
21003660e330SKris Buschelman     tmps = &ssealignedspace[offset];
21013660e330SKris Buschelman     tmpx = &ssealignedspace[offset+4];
21023660e330SKris Buschelman 
21033660e330SKris Buschelman   /* forward solve the lower triangular */
21043660e330SKris Buschelman     idx  = 0;
21053660e330SKris Buschelman     x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21063660e330SKris Buschelman     v    =  aa + 16*ai[1];
21073660e330SKris Buschelman 
21083660e330SKris Buschelman     for (i=1; i<n;) {
21093660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
21103660e330SKris Buschelman       vi   =  aj      + ai[i];
21113660e330SKris Buschelman       nz   =  diag[i] - ai[i];
21123660e330SKris Buschelman       idx +=  4;
21133660e330SKris Buschelman 
21143660e330SKris Buschelman     /* Demote sum from double to float */
21153660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
21163660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
21173660e330SKris Buschelman 
21183660e330SKris Buschelman       while (nz--) {
21193660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
21203660e330SKris Buschelman         jdx = 4*(*vi++);
21213660e330SKris Buschelman 
21223660e330SKris Buschelman         /* Demote solution (so far) from double to float */
21233660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[jdx]);
21243660e330SKris Buschelman 
21253660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
21263660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
21273660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
21283660e330SKris Buschelman 
21293660e330SKris Buschelman           /* First Column */
21303660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
21313660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
21323660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
21333660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
21343660e330SKris Buschelman 
21353660e330SKris Buschelman           /* Second Column */
21363660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
21373660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
21383660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
21393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
21403660e330SKris Buschelman 
21413660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
21423660e330SKris Buschelman 
21433660e330SKris Buschelman           /* Third Column */
21443660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
21453660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
21463660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
21473660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
21483660e330SKris Buschelman 
21493660e330SKris Buschelman           /* Fourth Column */
21503660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
21513660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
21523660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
21533660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
21543660e330SKris Buschelman         SSE_INLINE_END_2
21553660e330SKris Buschelman 
21563660e330SKris Buschelman         v  += 16;
21573660e330SKris Buschelman       }
21583660e330SKris Buschelman       v    =  aa + 16*ai[++i];
21593660e330SKris Buschelman       PREFETCH_NTA(v);
21603660e330SKris Buschelman       STORE_PS(tmps,XMM7);
21613660e330SKris Buschelman 
21623660e330SKris Buschelman       /* Promote result from float to double */
21633660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idx],tmps);
21643660e330SKris Buschelman     }
21653660e330SKris Buschelman     /* backward solve the upper triangular */
21663660e330SKris Buschelman     idt  = 4*(n-1);
21673660e330SKris Buschelman     ai16 = 16*diag[n-1];
21683660e330SKris Buschelman     v    = aa + ai16 + 16;
21693660e330SKris Buschelman     for (i=n-1; i>=0;){
21703660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
21713660e330SKris Buschelman       vi = aj + diag[i] + 1;
21723660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
21733660e330SKris Buschelman 
21743660e330SKris Buschelman       /* Demote accumulator from double to float */
21753660e330SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&x[idt]);
21763660e330SKris Buschelman       LOAD_PS(tmps,XMM7);
21773660e330SKris Buschelman 
21783660e330SKris Buschelman       while (nz--) {
21793660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
21803660e330SKris Buschelman         idx = 4*(*vi++);
21813660e330SKris Buschelman 
21823660e330SKris Buschelman         /* Demote solution (so far) from double to float */
21833660e330SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
21843660e330SKris Buschelman 
21853660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
21863660e330SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
21873660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
21883660e330SKris Buschelman 
21893660e330SKris Buschelman           /* First Column */
21903660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
21913660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
21923660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
21933660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
21943660e330SKris Buschelman 
21953660e330SKris Buschelman           /* Second Column */
21963660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
21973660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
21983660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
21993660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
22003660e330SKris Buschelman 
22013660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
22023660e330SKris Buschelman 
22033660e330SKris Buschelman           /* Third Column */
22043660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
22053660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
22063660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
22073660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
22083660e330SKris Buschelman 
22093660e330SKris Buschelman           /* Fourth Column */
22103660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
22113660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
22123660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
22133660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
22143660e330SKris Buschelman         SSE_INLINE_END_2
22153660e330SKris Buschelman         v  += 16;
22163660e330SKris Buschelman       }
22173660e330SKris Buschelman       v    = aa + ai16;
22183660e330SKris Buschelman       ai16 = 16*diag[--i];
22193660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
22203660e330SKris Buschelman       /*
22213660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
22223660e330SKris Buschelman          which was inverted as part of the factorization
22233660e330SKris Buschelman       */
22243660e330SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
22253660e330SKris Buschelman         /* First Column */
22263660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
22273660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
22283660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
22293660e330SKris Buschelman 
22303660e330SKris Buschelman         /* Second Column */
22313660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
22323660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
22333660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
22343660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
22353660e330SKris Buschelman 
22363660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
22373660e330SKris Buschelman 
22383660e330SKris Buschelman         /* Third Column */
22393660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
22403660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
22413660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
22423660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
22433660e330SKris Buschelman 
22443660e330SKris Buschelman         /* Fourth Column */
22453660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
22463660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
22473660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
22483660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
22493660e330SKris Buschelman 
22503660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
22513660e330SKris Buschelman       SSE_INLINE_END_3
22523660e330SKris Buschelman 
22533660e330SKris Buschelman       /* Promote solution from float to double */
22543660e330SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&x[idt],tmps);
22553660e330SKris Buschelman 
22563660e330SKris Buschelman       v    = aa + ai16 + 16;
22573660e330SKris Buschelman       idt -= 4;
22583660e330SKris Buschelman     }
22593660e330SKris Buschelman   }
22603660e330SKris Buschelman   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22613660e330SKris Buschelman   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22623660e330SKris Buschelman   PetscLogFlops(2*16*(a->nz) - 4*A->n);
22633660e330SKris Buschelman   SSE_SCOPE_END;
22643660e330SKris Buschelman   PetscFunctionReturn(0);
22653660e330SKris Buschelman }
22663660e330SKris Buschelman 
22673660e330SKris Buschelman #endif
22684a2ae208SSatish Balay #undef __FUNCT__
22694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
22704e2b4712SSatish Balay int MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
22714e2b4712SSatish Balay {
22724e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
22734e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
22744e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
22754e2b4712SSatish Balay   int             *diag = a->diag;
22763f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
227787828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3,*t;
22784e2b4712SSatish Balay 
22794e2b4712SSatish Balay   PetscFunctionBegin;
2280e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2281e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2282f1af5d2fSBarry Smith   t  = a->solve_work;
22834e2b4712SSatish Balay 
22844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
22864e2b4712SSatish Balay 
22874e2b4712SSatish Balay   /* forward solve the lower triangular */
22884e2b4712SSatish Balay   idx    = 3*(*r++);
2289f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
22904e2b4712SSatish Balay   for (i=1; i<n; i++) {
22914e2b4712SSatish Balay     v     = aa + 9*ai[i];
22924e2b4712SSatish Balay     vi    = aj + ai[i];
22934e2b4712SSatish Balay     nz    = diag[i] - ai[i];
22944e2b4712SSatish Balay     idx   = 3*(*r++);
2295f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
22964e2b4712SSatish Balay     while (nz--) {
22974e2b4712SSatish Balay       idx   = 3*(*vi++);
2298f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2299f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2300f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2301f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
23024e2b4712SSatish Balay       v += 9;
23034e2b4712SSatish Balay     }
23044e2b4712SSatish Balay     idx = 3*i;
2305f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
23064e2b4712SSatish Balay   }
23074e2b4712SSatish Balay   /* backward solve the upper triangular */
23084e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
23094e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
23104e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
23114e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
23124e2b4712SSatish Balay     idt  = 3*i;
2313f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
23144e2b4712SSatish Balay     while (nz--) {
23154e2b4712SSatish Balay       idx   = 3*(*vi++);
2316f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2317f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2318f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2319f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
23204e2b4712SSatish Balay       v += 9;
23214e2b4712SSatish Balay     }
23224e2b4712SSatish Balay     idc = 3*(*c--);
23234e2b4712SSatish Balay     v   = aa + 9*diag[i];
2324f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2325f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2326f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
23274e2b4712SSatish Balay   }
23284e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23294e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2330e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2331e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2332b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
23334e2b4712SSatish Balay   PetscFunctionReturn(0);
23344e2b4712SSatish Balay }
23354e2b4712SSatish Balay 
233615091d37SBarry Smith /*
233715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
233815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
233915091d37SBarry Smith */
23404a2ae208SSatish Balay #undef __FUNCT__
23414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
234215091d37SBarry Smith int MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
234315091d37SBarry Smith {
234415091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
234515091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
234615091d37SBarry Smith   int             ierr,*diag = a->diag;
234715091d37SBarry Smith   MatScalar       *aa=a->a,*v;
234887828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,s3,x1,x2,x3;
234915091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
235015091d37SBarry Smith 
235115091d37SBarry Smith   PetscFunctionBegin;
235215091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
235315091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
235415091d37SBarry Smith 
235515091d37SBarry Smith 
235615091d37SBarry Smith   /* forward solve the lower triangular */
235715091d37SBarry Smith   idx    = 0;
235815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
235915091d37SBarry Smith   for (i=1; i<n; i++) {
236015091d37SBarry Smith     v     =  aa      + 9*ai[i];
236115091d37SBarry Smith     vi    =  aj      + ai[i];
236215091d37SBarry Smith     nz    =  diag[i] - ai[i];
236315091d37SBarry Smith     idx   +=  3;
2364f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
236515091d37SBarry Smith     while (nz--) {
236615091d37SBarry Smith       jdx   = 3*(*vi++);
236715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2368f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2369f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2370f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
237115091d37SBarry Smith       v    += 9;
237215091d37SBarry Smith     }
2373f1af5d2fSBarry Smith     x[idx]   = s1;
2374f1af5d2fSBarry Smith     x[1+idx] = s2;
2375f1af5d2fSBarry Smith     x[2+idx] = s3;
237615091d37SBarry Smith   }
237715091d37SBarry Smith   /* backward solve the upper triangular */
237815091d37SBarry Smith   for (i=n-1; i>=0; i--){
237915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
238015091d37SBarry Smith     vi   = aj + diag[i] + 1;
238115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
238215091d37SBarry Smith     idt  = 3*i;
2383f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2384f1af5d2fSBarry Smith     s3 = x[2+idt];
238515091d37SBarry Smith     while (nz--) {
238615091d37SBarry Smith       idx   = 3*(*vi++);
238715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2388f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2389f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2390f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
239115091d37SBarry Smith       v    += 9;
239215091d37SBarry Smith     }
239315091d37SBarry Smith     v        = aa +  9*diag[i];
2394f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2395f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2396f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
239715091d37SBarry Smith   }
239815091d37SBarry Smith 
239915091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
240015091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2401b0a32e0cSBarry Smith   PetscLogFlops(2*9*(a->nz) - 3*A->n);
240215091d37SBarry Smith   PetscFunctionReturn(0);
240315091d37SBarry Smith }
240415091d37SBarry Smith 
24054a2ae208SSatish Balay #undef __FUNCT__
24064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
24074e2b4712SSatish Balay int MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
24084e2b4712SSatish Balay {
24094e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
24104e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
24114e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,*rout,*cout;
24124e2b4712SSatish Balay   int             *diag = a->diag;
24133f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
241487828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2,*t;
24154e2b4712SSatish Balay 
24164e2b4712SSatish Balay   PetscFunctionBegin;
2417e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2418e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2419f1af5d2fSBarry Smith   t  = a->solve_work;
24204e2b4712SSatish Balay 
24214e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
24224e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
24234e2b4712SSatish Balay 
24244e2b4712SSatish Balay   /* forward solve the lower triangular */
24254e2b4712SSatish Balay   idx    = 2*(*r++);
2426f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
24274e2b4712SSatish Balay   for (i=1; i<n; i++) {
24284e2b4712SSatish Balay     v     = aa + 4*ai[i];
24294e2b4712SSatish Balay     vi    = aj + ai[i];
24304e2b4712SSatish Balay     nz    = diag[i] - ai[i];
24314e2b4712SSatish Balay     idx   = 2*(*r++);
2432f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
24334e2b4712SSatish Balay     while (nz--) {
24344e2b4712SSatish Balay       idx   = 2*(*vi++);
2435f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2436f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2437f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
24384e2b4712SSatish Balay       v += 4;
24394e2b4712SSatish Balay     }
24404e2b4712SSatish Balay     idx = 2*i;
2441f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
24424e2b4712SSatish Balay   }
24434e2b4712SSatish Balay   /* backward solve the upper triangular */
24444e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24454e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
24464e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24474e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
24484e2b4712SSatish Balay     idt  = 2*i;
2449f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
24504e2b4712SSatish Balay     while (nz--) {
24514e2b4712SSatish Balay       idx   = 2*(*vi++);
2452f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2453f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2454f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
24554e2b4712SSatish Balay       v += 4;
24564e2b4712SSatish Balay     }
24574e2b4712SSatish Balay     idc = 2*(*c--);
24584e2b4712SSatish Balay     v   = aa + 4*diag[i];
2459f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2460f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
24614e2b4712SSatish Balay   }
24624e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
24634e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2464e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2465e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2466b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
24674e2b4712SSatish Balay   PetscFunctionReturn(0);
24684e2b4712SSatish Balay }
24694e2b4712SSatish Balay 
247015091d37SBarry Smith /*
247115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
247215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
247315091d37SBarry Smith */
24744a2ae208SSatish Balay #undef __FUNCT__
24754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
247615091d37SBarry Smith int MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
247715091d37SBarry Smith {
247815091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
247915091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
248015091d37SBarry Smith   int             ierr,*diag = a->diag;
248115091d37SBarry Smith   MatScalar       *aa=a->a,*v;
248287828ca2SBarry Smith   PetscScalar     *x,*b,s1,s2,x1,x2;
248315091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
248415091d37SBarry Smith 
248515091d37SBarry Smith   PetscFunctionBegin;
248615091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
248715091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
248815091d37SBarry Smith 
248915091d37SBarry Smith   /* forward solve the lower triangular */
249015091d37SBarry Smith   idx    = 0;
249115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
249215091d37SBarry Smith   for (i=1; i<n; i++) {
249315091d37SBarry Smith     v     =  aa      + 4*ai[i];
249415091d37SBarry Smith     vi    =  aj      + ai[i];
249515091d37SBarry Smith     nz    =  diag[i] - ai[i];
249615091d37SBarry Smith     idx   +=  2;
2497f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
249815091d37SBarry Smith     while (nz--) {
249915091d37SBarry Smith       jdx   = 2*(*vi++);
250015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2501f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2502f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
250315091d37SBarry Smith       v    += 4;
250415091d37SBarry Smith     }
2505f1af5d2fSBarry Smith     x[idx]   = s1;
2506f1af5d2fSBarry Smith     x[1+idx] = s2;
250715091d37SBarry Smith   }
250815091d37SBarry Smith   /* backward solve the upper triangular */
250915091d37SBarry Smith   for (i=n-1; i>=0; i--){
251015091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
251115091d37SBarry Smith     vi   = aj + diag[i] + 1;
251215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
251315091d37SBarry Smith     idt  = 2*i;
2514f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
251515091d37SBarry Smith     while (nz--) {
251615091d37SBarry Smith       idx   = 2*(*vi++);
251715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2518f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2519f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
252015091d37SBarry Smith       v    += 4;
252115091d37SBarry Smith     }
252215091d37SBarry Smith     v        = aa +  4*diag[i];
2523f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2524f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
252515091d37SBarry Smith   }
252615091d37SBarry Smith 
252715091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
252815091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2529b0a32e0cSBarry Smith   PetscLogFlops(2*4*(a->nz) - 2*A->n);
253015091d37SBarry Smith   PetscFunctionReturn(0);
253115091d37SBarry Smith }
253215091d37SBarry Smith 
25334a2ae208SSatish Balay #undef __FUNCT__
25344a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
25354e2b4712SSatish Balay int MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
25364e2b4712SSatish Balay {
25374e2b4712SSatish Balay   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ *)A->data;
25384e2b4712SSatish Balay   IS              iscol=a->col,isrow=a->row;
25394e2b4712SSatish Balay   int             *r,*c,ierr,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,*rout,*cout;
25404e2b4712SSatish Balay   int             *diag = a->diag;
25413f1db9ecSBarry Smith   MatScalar       *aa=a->a,*v;
254287828ca2SBarry Smith   PetscScalar     *x,*b,s1,*t;
25434e2b4712SSatish Balay 
25444e2b4712SSatish Balay   PetscFunctionBegin;
25454e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
25464e2b4712SSatish Balay 
2547e1311b90SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2548e1311b90SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2549f1af5d2fSBarry Smith   t  = a->solve_work;
25504e2b4712SSatish Balay 
25514e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25524e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25534e2b4712SSatish Balay 
25544e2b4712SSatish Balay   /* forward solve the lower triangular */
2555f1af5d2fSBarry Smith   t[0] = b[*r++];
25564e2b4712SSatish Balay   for (i=1; i<n; i++) {
25574e2b4712SSatish Balay     v     = aa + ai[i];
25584e2b4712SSatish Balay     vi    = aj + ai[i];
25594e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2560f1af5d2fSBarry Smith     s1  = b[*r++];
25614e2b4712SSatish Balay     while (nz--) {
2562f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
25634e2b4712SSatish Balay     }
2564f1af5d2fSBarry Smith     t[i] = s1;
25654e2b4712SSatish Balay   }
25664e2b4712SSatish Balay   /* backward solve the upper triangular */
25674e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
25684e2b4712SSatish Balay     v    = aa + diag[i] + 1;
25694e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
25704e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2571f1af5d2fSBarry Smith     s1 = t[i];
25724e2b4712SSatish Balay     while (nz--) {
2573f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
25744e2b4712SSatish Balay     }
2575f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
25764e2b4712SSatish Balay   }
25774e2b4712SSatish Balay 
25784e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25794e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2580e1311b90SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2581e1311b90SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2582b0a32e0cSBarry Smith   PetscLogFlops(2*1*(a->nz) - A->n);
25834e2b4712SSatish Balay   PetscFunctionReturn(0);
25844e2b4712SSatish Balay }
258515091d37SBarry Smith /*
258615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
258715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
258815091d37SBarry Smith */
25894a2ae208SSatish Balay #undef __FUNCT__
25904a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
259115091d37SBarry Smith int MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
259215091d37SBarry Smith {
259315091d37SBarry Smith   Mat_SeqBAIJ     *a = (Mat_SeqBAIJ *)A->data;
259415091d37SBarry Smith   int             n=a->mbs,*ai=a->i,*aj=a->j;
259515091d37SBarry Smith   int             ierr,*diag = a->diag;
259615091d37SBarry Smith   MatScalar       *aa=a->a;
259787828ca2SBarry Smith   PetscScalar     *x,*b;
259887828ca2SBarry Smith   PetscScalar     s1,x1;
259915091d37SBarry Smith   MatScalar       *v;
260015091d37SBarry Smith   int             jdx,idt,idx,nz,*vi,i;
260115091d37SBarry Smith 
260215091d37SBarry Smith   PetscFunctionBegin;
260315091d37SBarry Smith   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
260415091d37SBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260515091d37SBarry Smith 
260615091d37SBarry Smith   /* forward solve the lower triangular */
260715091d37SBarry Smith   idx    = 0;
260815091d37SBarry Smith   x[0]   = b[0];
260915091d37SBarry Smith   for (i=1; i<n; i++) {
261015091d37SBarry Smith     v     =  aa      + ai[i];
261115091d37SBarry Smith     vi    =  aj      + ai[i];
261215091d37SBarry Smith     nz    =  diag[i] - ai[i];
261315091d37SBarry Smith     idx   +=  1;
2614f1af5d2fSBarry Smith     s1  =  b[idx];
261515091d37SBarry Smith     while (nz--) {
261615091d37SBarry Smith       jdx   = *vi++;
261715091d37SBarry Smith       x1    = x[jdx];
2618f1af5d2fSBarry Smith       s1 -= v[0]*x1;
261915091d37SBarry Smith       v    += 1;
262015091d37SBarry Smith     }
2621f1af5d2fSBarry Smith     x[idx]   = s1;
262215091d37SBarry Smith   }
262315091d37SBarry Smith   /* backward solve the upper triangular */
262415091d37SBarry Smith   for (i=n-1; i>=0; i--){
262515091d37SBarry Smith     v    = aa + diag[i] + 1;
262615091d37SBarry Smith     vi   = aj + diag[i] + 1;
262715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
262815091d37SBarry Smith     idt  = i;
2629f1af5d2fSBarry Smith     s1 = x[idt];
263015091d37SBarry Smith     while (nz--) {
263115091d37SBarry Smith       idx   = *vi++;
263215091d37SBarry Smith       x1    = x[idx];
2633f1af5d2fSBarry Smith       s1 -= v[0]*x1;
263415091d37SBarry Smith       v    += 1;
263515091d37SBarry Smith     }
263615091d37SBarry Smith     v        = aa +  diag[i];
2637f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
263815091d37SBarry Smith   }
263915091d37SBarry Smith   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
264015091d37SBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2641b0a32e0cSBarry Smith   PetscLogFlops(2*(a->nz) - A->n);
264215091d37SBarry Smith   PetscFunctionReturn(0);
264315091d37SBarry Smith }
26444e2b4712SSatish Balay 
26454e2b4712SSatish Balay /* ----------------------------------------------------------------*/
26464e2b4712SSatish Balay /*
26474e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
26484e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
26494e2b4712SSatish Balay    Not a good example of code reuse.
26504e2b4712SSatish Balay */
2651ca44d042SBarry Smith EXTERN int MatMissingDiagonal_SeqBAIJ(Mat);
2652435faa5fSBarry Smith 
26534a2ae208SSatish Balay #undef __FUNCT__
26544a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
2655435faa5fSBarry Smith int MatILUFactorSymbolic_SeqBAIJ(Mat A,IS isrow,IS iscol,MatILUInfo *info,Mat *fact)
26564e2b4712SSatish Balay {
26574e2b4712SSatish Balay   Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b;
26584e2b4712SSatish Balay   IS          isicol;
26594e2b4712SSatish Balay   int         *r,*ic,ierr,prow,n = a->mbs,*ai = a->i,*aj = a->j;
26604e2b4712SSatish Balay   int         *ainew,*ajnew,jmax,*fill,*xi,nz,*im,*ajfill,*flev;
2661335d9088SBarry Smith   int         *dloc,idx,row,m,fm,nzf,nzi,len, realloc = 0,dcount = 0;
2662435faa5fSBarry Smith   int         incrlev,nnz,i,bs = a->bs,bs2 = a->bs2,levels,diagonal_fill;
26634533b203SBarry Smith   PetscTruth  col_identity,row_identity;
2664329f5518SBarry Smith   PetscReal   f;
26654e2b4712SSatish Balay 
26664e2b4712SSatish Balay   PetscFunctionBegin;
2667435faa5fSBarry Smith   if (info) {
2668435faa5fSBarry Smith     f             = info->fill;
2669335d9088SBarry Smith     levels        = (int)info->levels;
2670335d9088SBarry Smith     diagonal_fill = (int)info->diagonal_fill;
2671435faa5fSBarry Smith   } else {
2672435faa5fSBarry Smith     f             = 1.0;
2673435faa5fSBarry Smith     levels        = 0;
2674435faa5fSBarry Smith     diagonal_fill = 0;
2675435faa5fSBarry Smith   }
26764c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
2677667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
2678667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
2679309c388cSBarry Smith 
2680309c388cSBarry Smith   if (!levels && row_identity && col_identity) {  /* special case copy the nonzero structure */
2681bb3d539aSBarry Smith     ierr = MatDuplicate_SeqBAIJ(A,MAT_DO_NOT_COPY_VALUES,fact);CHKERRQ(ierr);
2682bb3d539aSBarry Smith     (*fact)->factor = FACTOR_LU;
2683bb3d539aSBarry Smith     b               = (Mat_SeqBAIJ*)(*fact)->data;
2684bb3d539aSBarry Smith     if (!b->diag) {
2685bb3d539aSBarry Smith       ierr = MatMarkDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2686bb3d539aSBarry Smith     }
2687bb3d539aSBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(*fact);CHKERRQ(ierr);
2688bb3d539aSBarry Smith     b->row        = isrow;
2689bb3d539aSBarry Smith     b->col        = iscol;
2690bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2691bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2692bb3d539aSBarry Smith     b->icol       = isicol;
269387828ca2SBarry Smith     ierr          = PetscMalloc(((*fact)->m+1+b->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
2694309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
26954e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
26964e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
26974e2b4712SSatish Balay 
26984e2b4712SSatish Balay     /* get new row pointers */
2699b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&ainew);CHKERRQ(ierr);
27004e2b4712SSatish Balay     ainew[0] = 0;
27014e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
27024e2b4712SSatish Balay     jmax = (int)(f*ai[n] + 1);
270382502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajnew);CHKERRQ(ierr);
27044e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
270582502324SSatish Balay     ierr = PetscMalloc((jmax)*sizeof(int),&ajfill);CHKERRQ(ierr);
27064e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
2707b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&fill);CHKERRQ(ierr);
27084e2b4712SSatish Balay     /* im is level for each filled value */
2709b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&im);CHKERRQ(ierr);
27104e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
2711b0a32e0cSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(int),&dloc);CHKERRQ(ierr);
27124e2b4712SSatish Balay     dloc[0]  = 0;
27134e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
2714435faa5fSBarry Smith 
2715435faa5fSBarry Smith       /* copy prow into linked list */
27164e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
271729bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
27184e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
27194e2b4712SSatish Balay       fill[n]    = n;
2720435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
27214e2b4712SSatish Balay       while (nz--) {
27224e2b4712SSatish Balay 	fm  = n;
27234e2b4712SSatish Balay 	idx = ic[*xi++];
27244e2b4712SSatish Balay 	do {
27254e2b4712SSatish Balay 	  m  = fm;
27264e2b4712SSatish Balay 	  fm = fill[m];
27274e2b4712SSatish Balay 	} while (fm < idx);
27284e2b4712SSatish Balay 	fill[m]   = idx;
27294e2b4712SSatish Balay 	fill[idx] = fm;
27304e2b4712SSatish Balay 	im[idx]   = 0;
27314e2b4712SSatish Balay       }
2732435faa5fSBarry Smith 
2733435faa5fSBarry Smith       /* make sure diagonal entry is included */
2734435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
2735435faa5fSBarry Smith 	fm = n;
2736435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
2737435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
2738435faa5fSBarry Smith 	fill[fm]   = prow;
2739435faa5fSBarry Smith 	im[prow]   = 0;
2740435faa5fSBarry Smith 	nzf++;
2741335d9088SBarry Smith 	dcount++;
2742435faa5fSBarry Smith       }
2743435faa5fSBarry Smith 
27444e2b4712SSatish Balay       nzi = 0;
27454e2b4712SSatish Balay       row = fill[n];
27464e2b4712SSatish Balay       while (row < prow) {
27474e2b4712SSatish Balay 	incrlev = im[row] + 1;
27484e2b4712SSatish Balay 	nz      = dloc[row];
2749435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
27504e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
27514e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
27524e2b4712SSatish Balay 	fm      = row;
27534e2b4712SSatish Balay 	while (nnz-- > 0) {
27544e2b4712SSatish Balay 	  idx = *xi++;
27554e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
27564e2b4712SSatish Balay 	    flev++;
27574e2b4712SSatish Balay 	    continue;
27584e2b4712SSatish Balay 	  }
27594e2b4712SSatish Balay 	  do {
27604e2b4712SSatish Balay 	    m  = fm;
27614e2b4712SSatish Balay 	    fm = fill[m];
27624e2b4712SSatish Balay 	  } while (fm < idx);
27634e2b4712SSatish Balay 	  if (fm != idx) {
27644e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
27654e2b4712SSatish Balay 	    fill[m]   = idx;
27664e2b4712SSatish Balay 	    fill[idx] = fm;
27674e2b4712SSatish Balay 	    fm        = idx;
27684e2b4712SSatish Balay 	    nzf++;
2769ecf371e4SBarry Smith 	  } else {
27704e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
27714e2b4712SSatish Balay 	  }
27724e2b4712SSatish Balay 	  flev++;
27734e2b4712SSatish Balay 	}
27744e2b4712SSatish Balay 	row = fill[row];
27754e2b4712SSatish Balay 	nzi++;
27764e2b4712SSatish Balay       }
27774e2b4712SSatish Balay       /* copy new filled row into permanent storage */
27784e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
27794e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
2780ecf371e4SBarry Smith 
2781ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
2782ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
2783ecf371e4SBarry Smith 	/* just double the memory each time */
2784ecf371e4SBarry Smith 	int maxadd = jmax;
2785ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
27864e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
27874e2b4712SSatish Balay 	jmax += maxadd;
2788ecf371e4SBarry Smith 
2789ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
279082502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2791549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajnew,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2792606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
27934e2b4712SSatish Balay 	ajnew = xi;
279482502324SSatish Balay 	ierr = PetscMalloc(jmax*sizeof(int),&xi);CHKERRQ(ierr);
2795549d3d68SSatish Balay 	ierr = PetscMemcpy(xi,ajfill,ainew[prow]*sizeof(int));CHKERRQ(ierr);
2796606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
27974e2b4712SSatish Balay 	ajfill = xi;
2798ecf371e4SBarry Smith 	realloc++; /* count how many reallocations are needed */
27994e2b4712SSatish Balay       }
28004e2b4712SSatish Balay       xi          = ajnew + ainew[prow];
28014e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
28024e2b4712SSatish Balay       dloc[prow]  = nzi;
28034e2b4712SSatish Balay       fm          = fill[n];
28044e2b4712SSatish Balay       while (nzf--) {
28054e2b4712SSatish Balay 	*xi++   = fm;
28064e2b4712SSatish Balay 	*flev++ = im[fm];
28074e2b4712SSatish Balay 	fm      = fill[fm];
28084e2b4712SSatish Balay       }
2809435faa5fSBarry Smith       /* make sure row has diagonal entry */
2810435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
281129bbc08cSBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %d has missing diagonal in factored matrix\n\
2812435faa5fSBarry Smith     try running with -pc_ilu_nonzeros_along_diagonal or -pc_ilu_diagonal_fill",prow);
2813435faa5fSBarry Smith       }
28144e2b4712SSatish Balay     }
2815606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
28164e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
28174e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
2818606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
2819606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
28204e2b4712SSatish Balay 
28214e2b4712SSatish Balay     {
2822329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
2823b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Reallocs %d Fill ratio:given %g needed %g\n",realloc,f,af);
2824b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Run with -pc_ilu_fill %g or use \n",af);
2825b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:PCILUSetFill(pc,%g);\n",af);
2826b0a32e0cSBarry Smith       PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:for best performance.\n");
2827335d9088SBarry Smith       if (diagonal_fill) {
2828b0a32e0cSBarry Smith 	PetscLogInfo(A,"MatILUFactorSymbolic_SeqBAIJ:Detected and replace %d missing diagonals",dcount);
2829335d9088SBarry Smith       }
28304e2b4712SSatish Balay     }
28314e2b4712SSatish Balay 
28324e2b4712SSatish Balay     /* put together the new matrix */
28334e2b4712SSatish Balay     ierr = MatCreateSeqBAIJ(A->comm,bs,bs*n,bs*n,0,PETSC_NULL,fact);CHKERRQ(ierr);
2834b0a32e0cSBarry Smith     PetscLogObjectParent(*fact,isicol);
28354e2b4712SSatish Balay     b = (Mat_SeqBAIJ*)(*fact)->data;
2836606d414cSSatish Balay     ierr = PetscFree(b->imax);CHKERRQ(ierr);
28377c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
28383f1db9ecSBarry Smith     len = bs2*ainew[n]*sizeof(MatScalar);
28394e2b4712SSatish Balay     /* the next line frees the default space generated by the Create() */
2840606d414cSSatish Balay     ierr = PetscFree(b->a);CHKERRQ(ierr);
2841606d414cSSatish Balay     ierr = PetscFree(b->ilen);CHKERRQ(ierr);
284282502324SSatish Balay     ierr = PetscMalloc(len,&b->a);CHKERRQ(ierr);
28434e2b4712SSatish Balay     b->j          = ajnew;
28444e2b4712SSatish Balay     b->i          = ainew;
28454e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
28464e2b4712SSatish Balay     b->diag       = dloc;
28474e2b4712SSatish Balay     b->ilen       = 0;
28484e2b4712SSatish Balay     b->imax       = 0;
28494e2b4712SSatish Balay     b->row        = isrow;
28504e2b4712SSatish Balay     b->col        = iscol;
2851c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
2852c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
2853e51c0b9cSSatish Balay     b->icol       = isicol;
285487828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
28554e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
28564e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
285787828ca2SBarry Smith     PetscLogObjectMemory(*fact,(ainew[n]-n)*(sizeof(int))+bs2*ainew[n]*sizeof(PetscScalar));
28584e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
28594e2b4712SSatish Balay     (*fact)->factor   = FACTOR_LU;
28604e2b4712SSatish Balay 
28614e2b4712SSatish Balay     (*fact)->info.factor_mallocs    = realloc;
28624e2b4712SSatish Balay     (*fact)->info.fill_ratio_given  = f;
2863329f5518SBarry Smith     (*fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
2864309c388cSBarry Smith   }
28654e2b4712SSatish Balay 
2866309c388cSBarry Smith   if (row_identity && col_identity) {
2867732ee342SKris Buschelman     ierr = MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(*fact);CHKERRQ(ierr);
28688661488fSKris Buschelman   }
28698661488fSKris Buschelman   PetscFunctionReturn(0);
28708661488fSKris Buschelman }
28718661488fSKris Buschelman 
2872732ee342SKris Buschelman #undef __FUNCT__
2873732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering"
2874732ee342SKris Buschelman int MatSeqBAIJ_UpdateFactorNumeric_NaturalOrdering(Mat inA)
28758661488fSKris Buschelman {
28768661488fSKris Buschelman   /*
28778661488fSKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
28788661488fSKris Buschelman       with natural ordering
28798661488fSKris Buschelman   */
28808661488fSKris Buschelman   Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)inA->data;
28816b7cc795SKris Buschelman   PetscTruth  sse_enabled_local,sse_enabled_global;
2882732ee342SKris Buschelman   int         ierr;
28838661488fSKris Buschelman 
28848661488fSKris Buschelman   PetscFunctionBegin;
2885a7ba9c3cSKris Buschelman   inA->ops->solve             = MatSolve_SeqBAIJ_Update;
2886a7ba9c3cSKris Buschelman   inA->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_Update;
28878661488fSKris Buschelman   switch (a->bs) {
28888661488fSKris Buschelman   case 1:
28898661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_1;
2890732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=1\n");
2891732ee342SKris Buschelman     break;
2892309c388cSBarry Smith   case 2:
28938661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_2_NaturalOrdering;
2894732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=2\n");
2895309c388cSBarry Smith     break;
2896309c388cSBarry Smith   case 3:
28978661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_3_NaturalOrdering;
2898732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=3\n");
2899309c388cSBarry Smith     break;
2900309c388cSBarry Smith   case 4:
29016b7cc795SKris Buschelman     ierr = PetscSSEIsEnabled(inA->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
29026b7cc795SKris Buschelman     if (sse_enabled_local) {
2903b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
29048661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering_SSE;
2905732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special SSE, in-place natural ordering factor BS=4\n");
2906b988c221SKris Buschelman #else
2907b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
2908b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
2909b988c221SKris Buschelman #endif
29103ba47ebaSKris Buschelman     } else {
29118661488fSKris Buschelman       inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_4_NaturalOrdering;
2912732ee342SKris Buschelman       PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=4\n");
29133ba47ebaSKris Buschelman     }
2914309c388cSBarry Smith     break;
2915309c388cSBarry Smith   case 5:
29168661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_5_NaturalOrdering;
2917732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=5\n");
2918309c388cSBarry Smith     break;
2919309c388cSBarry Smith   case 6:
29208661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_6_NaturalOrdering;
2921732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=6\n");
2922309c388cSBarry Smith     break;
2923309c388cSBarry Smith   case 7:
29248661488fSKris Buschelman     inA->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_7_NaturalOrdering;
2925732ee342SKris Buschelman     PetscLogInfo(inA,"MatILUFactor_SeqBAIJ:Using special in-place natural ordering factor BS=7\n");
2926309c388cSBarry Smith     break;
2927309c388cSBarry Smith   }
29284e2b4712SSatish Balay   PetscFunctionReturn(0);
29294e2b4712SSatish Balay }
2930732ee342SKris Buschelman 
2931732ee342SKris Buschelman #undef __FUNCT__
2932732ee342SKris Buschelman #define __FUNCT__ "MatSeqBAIJ_UpdateSolvers"
2933732ee342SKris Buschelman int MatSeqBAIJ_UpdateSolvers(Mat A)
2934732ee342SKris Buschelman {
2935732ee342SKris Buschelman   /*
2936732ee342SKris Buschelman       Blocksize 2, 3, 4, 5, 6 and 7 have a special faster factorization/solver
2937732ee342SKris Buschelman       with natural ordering
2938732ee342SKris Buschelman   */
2939732ee342SKris Buschelman   Mat_SeqBAIJ *a  = (Mat_SeqBAIJ *)A->data;
2940732ee342SKris Buschelman   IS          row = a->row, col = a->col;
2941732ee342SKris Buschelman   PetscTruth  row_identity, col_identity;
294294ee7fc8SKris Buschelman   PetscTruth  sse_enabled_local, sse_enabled_global;
294394ee7fc8SKris Buschelman   PetscTruth  use_single, use_natural;
2944732ee342SKris Buschelman   int         ierr;
2945732ee342SKris Buschelman 
2946732ee342SKris Buschelman   PetscFunctionBegin;
2947cf242676SKris Buschelman 
294894ee7fc8SKris Buschelman   use_single  = PETSC_FALSE;
294994ee7fc8SKris Buschelman   use_natural = PETSC_FALSE;
2950cf242676SKris Buschelman 
2951732ee342SKris Buschelman   ierr = ISIdentity(row,&row_identity);CHKERRQ(ierr);
2952732ee342SKris Buschelman   ierr = ISIdentity(col,&col_identity);CHKERRQ(ierr);
2953732ee342SKris Buschelman 
2954732ee342SKris Buschelman   if (row_identity && col_identity) {
2955732ee342SKris Buschelman     use_natural = PETSC_TRUE;
2956732ee342SKris Buschelman   } else {
2957732ee342SKris Buschelman     use_natural = PETSC_FALSE;
2958732ee342SKris Buschelman   }
2959732ee342SKris Buschelman   switch (a->bs) {
2960732ee342SKris Buschelman   case 1:
2961732ee342SKris Buschelman     if (use_natural) {
2962732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1_NaturalOrdering;
2963732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1_NaturalOrdering;
2964732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=1\n");
2965732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2966732ee342SKris Buschelman     } else {
2967732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_1;
2968732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_1;
2969732ee342SKris Buschelman     }
2970732ee342SKris Buschelman     break;
2971732ee342SKris Buschelman   case 2:
2972732ee342SKris Buschelman     if (use_natural) {
2973732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2_NaturalOrdering;
2974732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2_NaturalOrdering;
2975732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=2\n");
2976732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2977732ee342SKris Buschelman     } else {
2978732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_2;
2979732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_2;
2980732ee342SKris Buschelman     }
2981732ee342SKris Buschelman     break;
2982732ee342SKris Buschelman   case 3:
2983732ee342SKris Buschelman     if (use_natural) {
2984732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3_NaturalOrdering;
2985732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3_NaturalOrdering;
2986732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=3\n");
2987732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
2988732ee342SKris Buschelman     } else {
2989732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_3;
2990732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_3;
2991732ee342SKris Buschelman     }
2992732ee342SKris Buschelman     break;
2993732ee342SKris Buschelman   case 4:
299494ee7fc8SKris Buschelman     ierr = PetscSSEIsEnabled(A->comm,&sse_enabled_local,&sse_enabled_global);CHKERRQ(ierr);
299594ee7fc8SKris Buschelman     if (sse_enabled_local) {
2996cf242676SKris Buschelman       PetscTruth single_prec,flg;
2997cf242676SKris Buschelman       single_prec = flg = PETSC_FALSE;
2998cf242676SKris Buschelman       ierr = PetscOptionsGetLogical(PETSC_NULL,"-mat_single_precision_solves",&single_prec,&flg);CHKERRQ(ierr);
2999cf242676SKris Buschelman       if (flg) {
3000cf242676SKris Buschelman         a->single_precision_solves = single_prec;
3001cf242676SKris Buschelman       }
3002732ee342SKris Buschelman       if (a->single_precision_solves) {
3003732ee342SKris Buschelman         use_single = PETSC_TRUE;
3004732ee342SKris Buschelman       }
3005732ee342SKris Buschelman     } else {
3006732ee342SKris Buschelman       use_single = PETSC_FALSE;
3007732ee342SKris Buschelman     }
3008732ee342SKris Buschelman     if (use_natural) {
3009732ee342SKris Buschelman       if (use_single) {
3010b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
3011732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion;
3012732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE, in-place natural ordering solve BS=4\n");
3013b988c221SKris Buschelman #else
3014b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3015b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3016b988c221SKris Buschelman #endif
3017732ee342SKris Buschelman       } else {
3018732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_NaturalOrdering;
3019732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3020732ee342SKris Buschelman       }
3021732ee342SKris Buschelman       A->ops->solvetranspose    = MatSolveTranspose_SeqBAIJ_4_NaturalOrdering;
3022732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=4\n");
3023732ee342SKris Buschelman     } else {
3024732ee342SKris Buschelman       if (use_single) {
3025b988c221SKris Buschelman #if defined(PETSC_HAVE_SSE)
3026732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4_SSE_Demotion;
3027732ee342SKris Buschelman         PetscLogInfo(A,"MatSolve_SeqBAIJ:Using single precision, SSE solve BS=4\n");
3028b988c221SKris Buschelman #else
3029b988c221SKris Buschelman       /* This should never be reached.  If so, problem in PetscSSEIsEnabled. */
3030b988c221SKris Buschelman       SETERRQ(PETSC_ERR_SUP,"SSE Hardware unavailable");
3031b988c221SKris Buschelman #endif
3032732ee342SKris Buschelman       } else {
3033732ee342SKris Buschelman         A->ops->solve           = MatSolve_SeqBAIJ_4;
3034732ee342SKris Buschelman       }
3035732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_4;
3036732ee342SKris Buschelman     }
3037732ee342SKris Buschelman     break;
3038732ee342SKris Buschelman   case 5:
3039732ee342SKris Buschelman     if (use_natural) {
3040732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5_NaturalOrdering;
3041732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5_NaturalOrdering;
3042732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3043732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=5\n");
3044732ee342SKris Buschelman     } else {
3045732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_5;
3046732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_5;
3047732ee342SKris Buschelman     }
3048732ee342SKris Buschelman     break;
3049732ee342SKris Buschelman   case 6:
3050732ee342SKris Buschelman     if (use_natural) {
3051732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6_NaturalOrdering;
3052732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6_NaturalOrdering;
3053732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3054732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=6\n");
3055732ee342SKris Buschelman     } else {
3056732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_6;
3057732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_6;
3058732ee342SKris Buschelman     }
3059732ee342SKris Buschelman     break;
3060732ee342SKris Buschelman   case 7:
3061732ee342SKris Buschelman     if (use_natural) {
3062732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7_NaturalOrdering;
3063732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7_NaturalOrdering;
3064732ee342SKris Buschelman       PetscLogInfo(A,"MatSolve_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3065732ee342SKris Buschelman       PetscLogInfo(A,"MatSolveTranspose_SeqBAIJ:Using special in-place natural ordering solve BS=7\n");
3066732ee342SKris Buschelman     } else {
3067732ee342SKris Buschelman       A->ops->solve           = MatSolve_SeqBAIJ_7;
3068732ee342SKris Buschelman       A->ops->solvetranspose  = MatSolveTranspose_SeqBAIJ_7;
3069732ee342SKris Buschelman     }
3070732ee342SKris Buschelman     break;
307131801e53SKris Buschelman   default:
307231801e53SKris Buschelman     A->ops->solve             = MatSolve_SeqBAIJ_N;
307331801e53SKris Buschelman     break;
3074732ee342SKris Buschelman   }
3075732ee342SKris Buschelman   PetscFunctionReturn(0);
3076732ee342SKris Buschelman }
3077732ee342SKris Buschelman 
3078732ee342SKris Buschelman #undef __FUNCT__
3079732ee342SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_Update"
3080732ee342SKris Buschelman int MatSolve_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3081732ee342SKris Buschelman   int ierr;
3082732ee342SKris Buschelman 
3083732ee342SKris Buschelman   PetscFunctionBegin;
3084732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3085cf242676SKris Buschelman   if (A->ops->solve != MatSolve_SeqBAIJ_Update) {
3086732ee342SKris Buschelman     ierr = (*A->ops->solve)(A,x,y);CHKERRQ(ierr);
3087cf242676SKris Buschelman   } else {
3088cf242676SKris Buschelman     SETERRQ(PETSC_ERR_SUP,"Something really wrong happened.");
3089cf242676SKris Buschelman   }
3090732ee342SKris Buschelman   PetscFunctionReturn(0);
3091732ee342SKris Buschelman }
3092732ee342SKris Buschelman 
3093732ee342SKris Buschelman #undef __FUNCT__
3094732ee342SKris Buschelman #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_Update"
3095732ee342SKris Buschelman int MatSolveTranspose_SeqBAIJ_Update(Mat A,Vec x,Vec y) {
3096732ee342SKris Buschelman   int ierr;
3097732ee342SKris Buschelman 
3098732ee342SKris Buschelman   PetscFunctionBegin;
3099732ee342SKris Buschelman   ierr = MatSeqBAIJ_UpdateSolvers(A);
3100732ee342SKris Buschelman   ierr = (*A->ops->solvetranspose)(A,x,y);CHKERRQ(ierr);
3101732ee342SKris Buschelman   PetscFunctionReturn(0);
3102732ee342SKris Buschelman }
3103732ee342SKris Buschelman 
3104732ee342SKris Buschelman 
3105