xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision b2b2dd246975d7f9c8a1571def503d28e659d8b1)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
/*
    Factorization code for BAIJ format.
*/
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
12818f690400SShri Abhyankar {
12828f690400SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12838f690400SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
12848f690400SShri Abhyankar   PetscErrorCode ierr;
12858f690400SShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
128629b92fc1SShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
12878f690400SShri Abhyankar   MatScalar      *aa=a->a,*v;
12888f690400SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
12898f690400SShri Abhyankar   PetscScalar    *x,*b,*t;
12908f690400SShri Abhyankar 
12918f690400SShri Abhyankar   PetscFunctionBegin;
12928f690400SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12938f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12948f690400SShri Abhyankar   t  = a->solve_work;
12958f690400SShri Abhyankar 
12968f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
129729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12988f690400SShri Abhyankar 
12998f690400SShri Abhyankar   /* forward solve the lower triangular */
130029b92fc1SShri Abhyankar   idx    = 7*r[0];
13018f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
13028f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
13038f690400SShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
13048f690400SShri Abhyankar 
13058f690400SShri Abhyankar   for (i=1; i<n; i++) {
13068f690400SShri Abhyankar     v     = aa + 49*ai[i];
13078f690400SShri Abhyankar     vi    = aj + ai[i];
13088f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
130929b92fc1SShri Abhyankar     idx   = 7*r[i];
13108f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
13118f690400SShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
131229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
131329b92fc1SShri Abhyankar       idx   = 7*vi[m];
13148f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
13158f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
13168f690400SShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
13178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13228f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13238f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13248f690400SShri Abhyankar       v += 49;
13258f690400SShri Abhyankar     }
13268f690400SShri Abhyankar     idx = 7*i;
13278f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
13288f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
13298f690400SShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
13308f690400SShri Abhyankar   }
13318f690400SShri Abhyankar   /* backward solve the upper triangular */
13328f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
13338f690400SShri Abhyankar     k    = 2*n-i;
13348f690400SShri Abhyankar     v    = aa + 49*ai[k];
13358f690400SShri Abhyankar     vi   = aj + ai[k];
13368f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
13378f690400SShri Abhyankar     idt  = 7*i;
13388f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
13398f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
13408f690400SShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
134129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
134229b92fc1SShri Abhyankar       idx   = 7*vi[m];
13438f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
13448f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13458f690400SShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
13468f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13478f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13488f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13498f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13508f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13518f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13528f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13538f690400SShri Abhyankar       v += 49;
13548f690400SShri Abhyankar     }
135529b92fc1SShri Abhyankar     idc = 7*c[i];
13568f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13578f690400SShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13588f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13598f690400SShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
13608f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
13618f690400SShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
13628f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
13638f690400SShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
13648f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
13658f690400SShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
13668f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
13678f690400SShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
13688f690400SShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
13698f690400SShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13708f690400SShri Abhyankar   }
13718f690400SShri Abhyankar 
13728f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13738f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13748f690400SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13758f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13768f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13778f690400SShri Abhyankar   PetscFunctionReturn(0);
13788f690400SShri Abhyankar }
13798f690400SShri Abhyankar 
13808f690400SShri Abhyankar #undef __FUNCT__
13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
138315091d37SBarry Smith {
138415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386dfbe8321SBarry Smith   PetscErrorCode    ierr;
1387690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1388d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1389d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390d9fead3dSBarry Smith   const PetscScalar *b;
139115091d37SBarry Smith 
139215091d37SBarry Smith   PetscFunctionBegin;
1393d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
139515091d37SBarry Smith   /* forward solve the lower triangular */
139615091d37SBarry Smith   idx    = 0;
139715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
139815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
139915091d37SBarry Smith   x[6] = b[6+idx];
140015091d37SBarry Smith   for (i=1; i<n; i++) {
140115091d37SBarry Smith     v     =  aa + 49*ai[i];
140215091d37SBarry Smith     vi    =  aj + ai[i];
140315091d37SBarry Smith     nz    =  diag[i] - ai[i];
140415091d37SBarry Smith     idx   =  7*i;
1405f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407f1af5d2fSBarry Smith     s7  =  b[6+idx];
140815091d37SBarry Smith     while (nz--) {
140915091d37SBarry Smith       jdx   = 7*(*vi++);
141015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
141115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
141215091d37SBarry Smith       x7    = x[6+jdx];
1413f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
142015091d37SBarry Smith       v += 49;
142115091d37SBarry Smith      }
1422f1af5d2fSBarry Smith     x[idx]   = s1;
1423f1af5d2fSBarry Smith     x[1+idx] = s2;
1424f1af5d2fSBarry Smith     x[2+idx] = s3;
1425f1af5d2fSBarry Smith     x[3+idx] = s4;
1426f1af5d2fSBarry Smith     x[4+idx] = s5;
1427f1af5d2fSBarry Smith     x[5+idx] = s6;
1428f1af5d2fSBarry Smith     x[6+idx] = s7;
142915091d37SBarry Smith   }
143015091d37SBarry Smith   /* backward solve the upper triangular */
143115091d37SBarry Smith   for (i=n-1; i>=0; i--){
143215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
143315091d37SBarry Smith     vi   = aj + diag[i] + 1;
143415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
143515091d37SBarry Smith     idt  = 7*i;
1436f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1437f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1438f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1439f1af5d2fSBarry Smith     s7 = x[6+idt];
144015091d37SBarry Smith     while (nz--) {
144115091d37SBarry Smith       idx   = 7*(*vi++);
144215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
144315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
144415091d37SBarry Smith       x7    = x[6+idx];
1445f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
145215091d37SBarry Smith       v += 49;
145315091d37SBarry Smith     }
145415091d37SBarry Smith     v        = aa + 49*diag[i];
1455f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
146915091d37SBarry Smith   }
147015091d37SBarry Smith 
1471d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
147415091d37SBarry Smith   PetscFunctionReturn(0);
147515091d37SBarry Smith }
147615091d37SBarry Smith 
14774a2ae208SSatish Balay #undef __FUNCT__
1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480cee9d6f2SShri Abhyankar {
1481cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
14826464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1484cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1485cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1487cee9d6f2SShri Abhyankar     PetscScalar       *x;
1488cee9d6f2SShri Abhyankar     const PetscScalar *b;
1489cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490cee9d6f2SShri Abhyankar 
1491cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1492cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1495cee9d6f2SShri Abhyankar     idx    = 0;
1496cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1499cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1500cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1501cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1502cee9d6f2SShri Abhyankar       idx   = bs*i;
1503cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
15056464896eSShri Abhyankar        for(k=0;k<nz;k++) {
15066464896eSShri Abhyankar           jdx   = bs*vi[k];
1507cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516cee9d6f2SShri Abhyankar           v   +=  bs2;
1517cee9d6f2SShri Abhyankar         }
1518cee9d6f2SShri Abhyankar 
1519cee9d6f2SShri Abhyankar        x[idx]   = s1;
1520cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1521cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1522cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1523cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1524cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1525cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1526cee9d6f2SShri Abhyankar     }
1527cee9d6f2SShri Abhyankar 
1528cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1529cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1530cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1531cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1532cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533cee9d6f2SShri Abhyankar      idt = bs*i;
1534cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
15366464896eSShri Abhyankar     for(k=0;k<nz;k++) {
15376464896eSShri Abhyankar       idx   = bs*vi[k];
1538cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547cee9d6f2SShri Abhyankar         v   +=  bs2;
1548cee9d6f2SShri Abhyankar     }
1549cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1550cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557cee9d6f2SShri Abhyankar   }
1558cee9d6f2SShri Abhyankar 
1559cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1563cee9d6f2SShri Abhyankar }
1564cee9d6f2SShri Abhyankar 
1565cee9d6f2SShri Abhyankar #undef __FUNCT__
15664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1567dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
156815091d37SBarry Smith {
156915091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
157015091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
15716849ba73SBarry Smith   PetscErrorCode    ierr;
15725d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
15735d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1574d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1575d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1576d9fead3dSBarry Smith   const PetscScalar *b;
157715091d37SBarry Smith   PetscFunctionBegin;
1578d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1580f1af5d2fSBarry Smith   t  = a->solve_work;
158115091d37SBarry Smith 
158215091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
158315091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
158415091d37SBarry Smith 
158515091d37SBarry Smith   /* forward solve the lower triangular */
158615091d37SBarry Smith   idx    = 6*(*r++);
1587f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1588f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1589f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
159015091d37SBarry Smith   for (i=1; i<n; i++) {
159115091d37SBarry Smith     v     = aa + 36*ai[i];
159215091d37SBarry Smith     vi    = aj + ai[i];
159315091d37SBarry Smith     nz    = diag[i] - ai[i];
159415091d37SBarry Smith     idx   = 6*(*r++);
1595f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1596f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
159715091d37SBarry Smith     while (nz--) {
159815091d37SBarry Smith       idx   = 6*(*vi++);
1599f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1600f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1601f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1602f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1603f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1604f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1605f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1606f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
160715091d37SBarry Smith       v += 36;
160815091d37SBarry Smith     }
160915091d37SBarry Smith     idx = 6*i;
1610f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1611f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1612f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
161315091d37SBarry Smith   }
161415091d37SBarry Smith   /* backward solve the upper triangular */
161515091d37SBarry Smith   for (i=n-1; i>=0; i--){
161615091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
161715091d37SBarry Smith     vi   = aj + diag[i] + 1;
161815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
161915091d37SBarry Smith     idt  = 6*i;
1620f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1621f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1622f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
162315091d37SBarry Smith     while (nz--) {
162415091d37SBarry Smith       idx   = 6*(*vi++);
1625f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1626f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1627f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1628f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1629f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1630f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1631f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1632f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1633f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
163415091d37SBarry Smith       v += 36;
163515091d37SBarry Smith     }
163615091d37SBarry Smith     idc = 6*(*c--);
163715091d37SBarry Smith     v   = aa + 36*diag[i];
1638f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1639f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1640f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1641f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1642f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1643f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1644f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1645f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1646f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1647f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1648f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1649f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
165015091d37SBarry Smith   }
165115091d37SBarry Smith 
165215091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
165315091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1654d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1656dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
165715091d37SBarry Smith   PetscFunctionReturn(0);
165815091d37SBarry Smith }
165915091d37SBarry Smith 
16604a2ae208SSatish Balay #undef __FUNCT__
16618f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
16628f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
16638f690400SShri Abhyankar {
16648f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
16658f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
16668f690400SShri Abhyankar   PetscErrorCode    ierr;
16678f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
166829b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
16698f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
16708f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
16718f690400SShri Abhyankar   const PetscScalar *b;
16728f690400SShri Abhyankar   PetscFunctionBegin;
16738f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16748f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
16758f690400SShri Abhyankar   t  = a->solve_work;
16768f690400SShri Abhyankar 
16778f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167829b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
16798f690400SShri Abhyankar 
16808f690400SShri Abhyankar   /* forward solve the lower triangular */
168129b92fc1SShri Abhyankar   idx    = 6*r[0];
16828f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
16838f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
16848f690400SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
16858f690400SShri Abhyankar   for (i=1; i<n; i++) {
16868f690400SShri Abhyankar     v     = aa + 36*ai[i];
16878f690400SShri Abhyankar     vi    = aj + ai[i];
16888f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
168929b92fc1SShri Abhyankar     idx   = 6*r[i];
16908f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
16918f690400SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
169229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
169329b92fc1SShri Abhyankar       idx   = 6*vi[m];
16948f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
16958f690400SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
16968f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
16978f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
16988f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
16998f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
17008f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
17018f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
17028f690400SShri Abhyankar       v += 36;
17038f690400SShri Abhyankar     }
17048f690400SShri Abhyankar     idx = 6*i;
17058f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
17068f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
17078f690400SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
17088f690400SShri Abhyankar   }
17098f690400SShri Abhyankar   /* backward solve the upper triangular */
17108f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
17118f690400SShri Abhyankar     k    = 2*n-i;
17128f690400SShri Abhyankar     v    = aa + 36*ai[k];
17138f690400SShri Abhyankar     vi   = aj + ai[k];
17148f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
17158f690400SShri Abhyankar     idt  = 6*i;
17168f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
17178f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
17188f690400SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
171929b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
172029b92fc1SShri Abhyankar       idx   = 6*vi[m];
17218f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
17228f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
17238f690400SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
17248f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
17258f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
17268f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
17278f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
17288f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
17298f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
17308f690400SShri Abhyankar       v += 36;
17318f690400SShri Abhyankar     }
173229b92fc1SShri Abhyankar     idc = 6*c[i];
17338f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
17348f690400SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
17358f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
17368f690400SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
17378f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
17388f690400SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
17398f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
17408f690400SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
17418f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
17428f690400SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
17438f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
17448f690400SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
17458f690400SShri Abhyankar   }
17468f690400SShri Abhyankar 
17478f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17488f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17498f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17508f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
17518f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
17528f690400SShri Abhyankar   PetscFunctionReturn(0);
17538f690400SShri Abhyankar }
17548f690400SShri Abhyankar 
17558f690400SShri Abhyankar 
17568f690400SShri Abhyankar #undef __FUNCT__
17574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1758dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
175915091d37SBarry Smith {
176015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1761690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1762dfbe8321SBarry Smith   PetscErrorCode    ierr;
1763690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1764d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1765d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1766d9fead3dSBarry Smith   const PetscScalar *b;
176715091d37SBarry Smith 
176815091d37SBarry Smith   PetscFunctionBegin;
1769d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
177115091d37SBarry Smith   /* forward solve the lower triangular */
177215091d37SBarry Smith   idx    = 0;
177315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
177415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
177515091d37SBarry Smith   for (i=1; i<n; i++) {
177615091d37SBarry Smith     v     =  aa + 36*ai[i];
177715091d37SBarry Smith     vi    =  aj + ai[i];
177815091d37SBarry Smith     nz    =  diag[i] - ai[i];
177915091d37SBarry Smith     idx   =  6*i;
1780f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1781f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
178215091d37SBarry Smith     while (nz--) {
178315091d37SBarry Smith       jdx   = 6*(*vi++);
178415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
178515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1786f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1787f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1788f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1789f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1790f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1791f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
179215091d37SBarry Smith       v += 36;
179315091d37SBarry Smith      }
1794f1af5d2fSBarry Smith     x[idx]   = s1;
1795f1af5d2fSBarry Smith     x[1+idx] = s2;
1796f1af5d2fSBarry Smith     x[2+idx] = s3;
1797f1af5d2fSBarry Smith     x[3+idx] = s4;
1798f1af5d2fSBarry Smith     x[4+idx] = s5;
1799f1af5d2fSBarry Smith     x[5+idx] = s6;
180015091d37SBarry Smith   }
180115091d37SBarry Smith   /* backward solve the upper triangular */
180215091d37SBarry Smith   for (i=n-1; i>=0; i--){
180315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
180415091d37SBarry Smith     vi   = aj + diag[i] + 1;
180515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
180615091d37SBarry Smith     idt  = 6*i;
1807f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1808f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1809f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
181015091d37SBarry Smith     while (nz--) {
181115091d37SBarry Smith       idx   = 6*(*vi++);
181215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
181315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1814f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1815f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1816f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1817f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1818f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1819f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
182015091d37SBarry Smith       v += 36;
182115091d37SBarry Smith     }
182215091d37SBarry Smith     v        = aa + 36*diag[i];
1823f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1824f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1825f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1826f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1827f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1828f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
182915091d37SBarry Smith   }
183015091d37SBarry Smith 
1831d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1833dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
183415091d37SBarry Smith   PetscFunctionReturn(0);
183515091d37SBarry Smith }
183615091d37SBarry Smith 
18374a2ae208SSatish Balay #undef __FUNCT__
1838cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1839cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1840cee9d6f2SShri Abhyankar {
1841cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
18426464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1843cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1844cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1845cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1846cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1847cee9d6f2SShri Abhyankar     PetscScalar       *x;
1848cee9d6f2SShri Abhyankar     const PetscScalar *b;
1849cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1850cee9d6f2SShri Abhyankar 
1851cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1852cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1853cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1854cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1855cee9d6f2SShri Abhyankar     idx    = 0;
1856cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1857cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
1858cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1859cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1860cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1861cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1862cee9d6f2SShri Abhyankar       idx   = bs*i;
1863cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1864cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
18656464896eSShri Abhyankar        for(k=0;k<nz;k++){
18666464896eSShri Abhyankar           jdx   = bs*vi[k];
1867cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1868cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1869cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1870cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1871cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1872cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1873cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1874cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1875cee9d6f2SShri Abhyankar           v   +=  bs2;
1876cee9d6f2SShri Abhyankar         }
1877cee9d6f2SShri Abhyankar 
1878cee9d6f2SShri Abhyankar        x[idx]   = s1;
1879cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1880cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1881cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1882cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1883cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1884cee9d6f2SShri Abhyankar     }
1885cee9d6f2SShri Abhyankar 
1886cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1887cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1888cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1889cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1890cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1891cee9d6f2SShri Abhyankar      idt = bs*i;
1892cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1893cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
18946464896eSShri Abhyankar      for(k=0;k<nz;k++){
18956464896eSShri Abhyankar       idx   = bs*vi[k];
1896cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1897cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
1898cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1899cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1900cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1901cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1902cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1903cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1904cee9d6f2SShri Abhyankar         v   +=  bs2;
1905cee9d6f2SShri Abhyankar     }
1906cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1907cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1908cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1909cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1910cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1911cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1912cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1913cee9d6f2SShri Abhyankar   }
1914cee9d6f2SShri Abhyankar 
1915cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1916cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1917cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1918cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1919cee9d6f2SShri Abhyankar }
19208f690400SShri Abhyankar 
1921cee9d6f2SShri Abhyankar #undef __FUNCT__
19224a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1923dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
19244e2b4712SSatish Balay {
19254e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
19264e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
19276849ba73SBarry Smith   PetscErrorCode    ierr;
19285d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
19295d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1930d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1931d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1932d9fead3dSBarry Smith   const PetscScalar *b;
19334e2b4712SSatish Balay 
19344e2b4712SSatish Balay   PetscFunctionBegin;
1935d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19361ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1937f1af5d2fSBarry Smith   t  = a->solve_work;
19384e2b4712SSatish Balay 
19394e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
19404e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
19414e2b4712SSatish Balay 
19424e2b4712SSatish Balay   /* forward solve the lower triangular */
19434e2b4712SSatish Balay   idx    = 5*(*r++);
1944f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1945f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
19464e2b4712SSatish Balay   for (i=1; i<n; i++) {
19474e2b4712SSatish Balay     v     = aa + 25*ai[i];
19484e2b4712SSatish Balay     vi    = aj + ai[i];
19494e2b4712SSatish Balay     nz    = diag[i] - ai[i];
19504e2b4712SSatish Balay     idx   = 5*(*r++);
1951f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1952f1af5d2fSBarry Smith     s5  = b[4+idx];
19534e2b4712SSatish Balay     while (nz--) {
19544e2b4712SSatish Balay       idx   = 5*(*vi++);
1955f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1956f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1957f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1958f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1959f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1960f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1961f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
19624e2b4712SSatish Balay       v += 25;
19634e2b4712SSatish Balay     }
19644e2b4712SSatish Balay     idx = 5*i;
1965f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1966f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
19674e2b4712SSatish Balay   }
19684e2b4712SSatish Balay   /* backward solve the upper triangular */
19694e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
19704e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
19714e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
19724e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
19734e2b4712SSatish Balay     idt  = 5*i;
1974f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1975f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
19764e2b4712SSatish Balay     while (nz--) {
19774e2b4712SSatish Balay       idx   = 5*(*vi++);
1978f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1979f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1980f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1981f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1982f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1983f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1984f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
19854e2b4712SSatish Balay       v += 25;
19864e2b4712SSatish Balay     }
19874e2b4712SSatish Balay     idc = 5*(*c--);
19884e2b4712SSatish Balay     v   = aa + 25*diag[i];
1989f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1990f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1991f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1992f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1993f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1994f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1995f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1996f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1997f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1998f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
19994e2b4712SSatish Balay   }
20004e2b4712SSatish Balay 
20014e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
20024e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2003d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2005dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
20064e2b4712SSatish Balay   PetscFunctionReturn(0);
20074e2b4712SSatish Balay }
20084e2b4712SSatish Balay 
20094a2ae208SSatish Balay #undef __FUNCT__
20108f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
20118f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
20128f690400SShri Abhyankar {
20138f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20148f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
20158f690400SShri Abhyankar   PetscErrorCode    ierr;
20168f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
201729b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
20188f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
20198f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
20208f690400SShri Abhyankar   const PetscScalar *b;
20218f690400SShri Abhyankar 
20228f690400SShri Abhyankar   PetscFunctionBegin;
20238f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20248f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
20258f690400SShri Abhyankar   t  = a->solve_work;
20268f690400SShri Abhyankar 
20278f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
202829b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
20298f690400SShri Abhyankar 
20308f690400SShri Abhyankar   /* forward solve the lower triangular */
203129b92fc1SShri Abhyankar   idx    = 5*r[0];
20328f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
20338f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
20348f690400SShri Abhyankar   for (i=1; i<n; i++) {
20358f690400SShri Abhyankar     v     = aa + 25*ai[i];
20368f690400SShri Abhyankar     vi    = aj + ai[i];
20378f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
203829b92fc1SShri Abhyankar     idx   = 5*r[i];
20398f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
20408f690400SShri Abhyankar     s5  = b[4+idx];
204129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
204229b92fc1SShri Abhyankar       idx   = 5*vi[m];
20438f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
20448f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
20458f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
20468f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
20478f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
20488f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
20498f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
20508f690400SShri Abhyankar       v += 25;
20518f690400SShri Abhyankar     }
20528f690400SShri Abhyankar     idx = 5*i;
20538f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
20548f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
20558f690400SShri Abhyankar   }
20568f690400SShri Abhyankar   /* backward solve the upper triangular */
20578f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
20588f690400SShri Abhyankar     k    = 2*n-i;
20598f690400SShri Abhyankar     v    = aa + 25*ai[k];
20608f690400SShri Abhyankar     vi   = aj + ai[k];
20618f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
20628f690400SShri Abhyankar     idt  = 5*i;
20638f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
20648f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
206529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
206629b92fc1SShri Abhyankar       idx   = 5*vi[m];
20678f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
20688f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
20698f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
20708f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
20718f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
20728f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
20738f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
20748f690400SShri Abhyankar       v += 25;
20758f690400SShri Abhyankar     }
207629b92fc1SShri Abhyankar     idc = 5*c[i];
20778f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
20788f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
20798f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
20808f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
20818f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
20828f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
20838f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
20848f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
20858f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
20868f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
20878f690400SShri Abhyankar   }
20888f690400SShri Abhyankar 
20898f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
20908f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20918f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20928f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
20938f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
20948f690400SShri Abhyankar   PetscFunctionReturn(0);
20958f690400SShri Abhyankar }
20968f690400SShri Abhyankar #undef __FUNCT__
20974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2098dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
209915091d37SBarry Smith {
210015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2101690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2102dfbe8321SBarry Smith   PetscErrorCode    ierr;
2103690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2104d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2105d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2106d9fead3dSBarry Smith   const PetscScalar *b;
210715091d37SBarry Smith 
210815091d37SBarry Smith   PetscFunctionBegin;
2109d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
211115091d37SBarry Smith   /* forward solve the lower triangular */
211215091d37SBarry Smith   idx    = 0;
211315091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
211415091d37SBarry Smith   for (i=1; i<n; i++) {
211515091d37SBarry Smith     v     =  aa + 25*ai[i];
211615091d37SBarry Smith     vi    =  aj + ai[i];
211715091d37SBarry Smith     nz    =  diag[i] - ai[i];
211815091d37SBarry Smith     idx   =  5*i;
2119f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
212015091d37SBarry Smith     while (nz--) {
212115091d37SBarry Smith       jdx   = 5*(*vi++);
212215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2123f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2124f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2125f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2126f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2127f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
212815091d37SBarry Smith       v    += 25;
212915091d37SBarry Smith     }
2130f1af5d2fSBarry Smith     x[idx]   = s1;
2131f1af5d2fSBarry Smith     x[1+idx] = s2;
2132f1af5d2fSBarry Smith     x[2+idx] = s3;
2133f1af5d2fSBarry Smith     x[3+idx] = s4;
2134f1af5d2fSBarry Smith     x[4+idx] = s5;
213515091d37SBarry Smith   }
213615091d37SBarry Smith   /* backward solve the upper triangular */
213715091d37SBarry Smith   for (i=n-1; i>=0; i--){
213815091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
213915091d37SBarry Smith     vi   = aj + diag[i] + 1;
214015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
214115091d37SBarry Smith     idt  = 5*i;
2142f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2143f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
214415091d37SBarry Smith     while (nz--) {
214515091d37SBarry Smith       idx   = 5*(*vi++);
214615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2147f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2148f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2149f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2150f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2151f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
215215091d37SBarry Smith       v    += 25;
215315091d37SBarry Smith     }
215415091d37SBarry Smith     v        = aa + 25*diag[i];
2155f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2156f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2157f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2158f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2159f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
216015091d37SBarry Smith   }
216115091d37SBarry Smith 
2162d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2164dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
216515091d37SBarry Smith   PetscFunctionReturn(0);
216615091d37SBarry Smith }
216715091d37SBarry Smith 
21684a2ae208SSatish Balay #undef __FUNCT__
2169cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2170cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2171cee9d6f2SShri Abhyankar {
2172cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
21736464896eSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2174cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
2175cee9d6f2SShri Abhyankar   PetscInt          jdx;
2176cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2177cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2178cee9d6f2SShri Abhyankar   const PetscScalar *b;
2179cee9d6f2SShri Abhyankar 
2180cee9d6f2SShri Abhyankar   PetscFunctionBegin;
2181cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2182cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2183cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
2184cee9d6f2SShri Abhyankar   idx    = 0;
2185cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2186cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
2187cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
2188cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
2189cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
2190cee9d6f2SShri Abhyankar     idx = 5*i;
2191cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
21926464896eSShri Abhyankar     for(k=0;k<nz;k++) {
21936464896eSShri Abhyankar       jdx   = 5*vi[k];
2194cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2195cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2196cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2197cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2198cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2199cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2200cee9d6f2SShri Abhyankar       v    += 25;
2201cee9d6f2SShri Abhyankar     }
2202cee9d6f2SShri Abhyankar     x[idx]   = s1;
2203cee9d6f2SShri Abhyankar     x[1+idx] = s2;
2204cee9d6f2SShri Abhyankar     x[2+idx] = s3;
2205cee9d6f2SShri Abhyankar     x[3+idx] = s4;
2206cee9d6f2SShri Abhyankar     x[4+idx] = s5;
2207cee9d6f2SShri Abhyankar   }
2208cee9d6f2SShri Abhyankar 
2209cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
2210cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2211cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
2212cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
2213cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2214cee9d6f2SShri Abhyankar     idt = 5*i;
2215cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
2216cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
22176464896eSShri Abhyankar     for(k=0;k<nz;k++){
22186464896eSShri Abhyankar       idx   = 5*vi[k];
2219cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2220cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2221cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2222cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2223cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2224cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2225cee9d6f2SShri Abhyankar       v    += 25;
2226cee9d6f2SShri Abhyankar     }
2227cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2228cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2229cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2230cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2231cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2232cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2233cee9d6f2SShri Abhyankar   }
2234cee9d6f2SShri Abhyankar 
2235cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2236cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2237cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2238cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2239cee9d6f2SShri Abhyankar }
2240cee9d6f2SShri Abhyankar 
2241cee9d6f2SShri Abhyankar #undef __FUNCT__
22424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2243dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
22444e2b4712SSatish Balay {
22454e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
22464e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
22476849ba73SBarry Smith   PetscErrorCode    ierr;
22485d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
22495d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2250d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2251d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2252d9fead3dSBarry Smith   const PetscScalar *b;
22534e2b4712SSatish Balay 
22544e2b4712SSatish Balay   PetscFunctionBegin;
2255d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2257f1af5d2fSBarry Smith   t  = a->solve_work;
22584e2b4712SSatish Balay 
22594e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22604e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
22614e2b4712SSatish Balay 
22624e2b4712SSatish Balay   /* forward solve the lower triangular */
22634e2b4712SSatish Balay   idx    = 4*(*r++);
2264f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2265f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
22664e2b4712SSatish Balay   for (i=1; i<n; i++) {
22674e2b4712SSatish Balay     v     = aa + 16*ai[i];
22684e2b4712SSatish Balay     vi    = aj + ai[i];
22694e2b4712SSatish Balay     nz    = diag[i] - ai[i];
22704e2b4712SSatish Balay     idx   = 4*(*r++);
2271f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22724e2b4712SSatish Balay     while (nz--) {
22734e2b4712SSatish Balay       idx   = 4*(*vi++);
2274f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2275f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2276f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2277f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2278f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
22794e2b4712SSatish Balay       v    += 16;
22804e2b4712SSatish Balay     }
22814e2b4712SSatish Balay     idx        = 4*i;
2282f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2283f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
22844e2b4712SSatish Balay   }
22854e2b4712SSatish Balay   /* backward solve the upper triangular */
22864e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22874e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
22884e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
22894e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
22904e2b4712SSatish Balay     idt  = 4*i;
2291f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2292f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
22934e2b4712SSatish Balay     while (nz--) {
22944e2b4712SSatish Balay       idx   = 4*(*vi++);
2295f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2296f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2297f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2298f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2299f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2300f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
23014e2b4712SSatish Balay       v += 16;
23024e2b4712SSatish Balay     }
23034e2b4712SSatish Balay     idc      = 4*(*c--);
23044e2b4712SSatish Balay     v        = aa + 16*diag[i];
2305f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2306f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2307f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2308f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
23094e2b4712SSatish Balay   }
23104e2b4712SSatish Balay 
23114e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23124e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2313d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2315dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
23164e2b4712SSatish Balay   PetscFunctionReturn(0);
23174e2b4712SSatish Balay }
2318f26ec98cSKris Buschelman 
2319f26ec98cSKris Buschelman #undef __FUNCT__
23208f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
23218f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
23228f690400SShri Abhyankar {
23238f690400SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
23248f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
23258f690400SShri Abhyankar   PetscErrorCode    ierr;
232629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
23278f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
23288f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
23298f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
23308f690400SShri Abhyankar   const PetscScalar *b;
23318f690400SShri Abhyankar 
23328f690400SShri Abhyankar   PetscFunctionBegin;
23338f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23348f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23358f690400SShri Abhyankar   t  = a->solve_work;
23368f690400SShri Abhyankar 
23378f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
233829b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
23398f690400SShri Abhyankar 
23408f690400SShri Abhyankar   /* forward solve the lower triangular */
234129b92fc1SShri Abhyankar   idx    = 4*r[0];
23428f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
23438f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
23448f690400SShri Abhyankar   for (i=1; i<n; i++) {
23458f690400SShri Abhyankar     v     = aa + 16*ai[i];
23468f690400SShri Abhyankar     vi    = aj + ai[i];
23478f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
234829b92fc1SShri Abhyankar     idx   = 4*r[i];
23498f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
235029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
235129b92fc1SShri Abhyankar       idx   = 4*vi[m];
23528f690400SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
23538f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
23548f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
23558f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
23568f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
23578f690400SShri Abhyankar       v    += 16;
23588f690400SShri Abhyankar     }
23598f690400SShri Abhyankar     idx        = 4*i;
23608f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
23618f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
23628f690400SShri Abhyankar   }
23638f690400SShri Abhyankar   /* backward solve the upper triangular */
23648f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
23658f690400SShri Abhyankar     k    = 2*n-i;
23668f690400SShri Abhyankar     v    = aa + 16*ai[k];
23678f690400SShri Abhyankar     vi   = aj + ai[k];
23688f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
23698f690400SShri Abhyankar     idt  = 4*i;
23708f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
23718f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
237229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
237329b92fc1SShri Abhyankar       idx   = 4*vi[m];
23748f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
23758f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
23768f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
23778f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
23788f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
23798f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
23808f690400SShri Abhyankar       v += 16;
23818f690400SShri Abhyankar     }
238229b92fc1SShri Abhyankar     idc      = 4*c[i];
23838f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
23848f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
23858f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
23868f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
23878f690400SShri Abhyankar   }
23888f690400SShri Abhyankar 
23898f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23908f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23918f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23928f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23938f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
23948f690400SShri Abhyankar   PetscFunctionReturn(0);
23958f690400SShri Abhyankar }
23968f690400SShri Abhyankar 
23978f690400SShri Abhyankar #undef __FUNCT__
2398f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2399dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2400f26ec98cSKris Buschelman {
2401f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2402f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
24036849ba73SBarry Smith   PetscErrorCode    ierr;
24045d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
24055d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2406d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2407d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2408d9fead3dSBarry Smith   PetscScalar       *x;
2409d9fead3dSBarry Smith   const PetscScalar *b;
2410f26ec98cSKris Buschelman 
2411f26ec98cSKris Buschelman   PetscFunctionBegin;
2412d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2414f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2415f26ec98cSKris Buschelman 
2416f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2417f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2418f26ec98cSKris Buschelman 
2419f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2420f26ec98cSKris Buschelman   idx    = 4*(*r++);
2421f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2422f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2423f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2424f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2425f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2426f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2427f26ec98cSKris Buschelman     vi    = aj + ai[i];
2428f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2429f26ec98cSKris Buschelman     idx   = 4*(*r++);
2430f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2431f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2432f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2433f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2434f26ec98cSKris Buschelman     while (nz--) {
2435f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2436f26ec98cSKris Buschelman       x1  = t[idx];
2437f26ec98cSKris Buschelman       x2  = t[1+idx];
2438f26ec98cSKris Buschelman       x3  = t[2+idx];
2439f26ec98cSKris Buschelman       x4  = t[3+idx];
2440f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2441f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2442f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2443f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2444f26ec98cSKris Buschelman       v    += 16;
2445f26ec98cSKris Buschelman     }
2446f26ec98cSKris Buschelman     idx        = 4*i;
2447f26ec98cSKris Buschelman     t[idx]   = s1;
2448f26ec98cSKris Buschelman     t[1+idx] = s2;
2449f26ec98cSKris Buschelman     t[2+idx] = s3;
2450f26ec98cSKris Buschelman     t[3+idx] = s4;
2451f26ec98cSKris Buschelman   }
2452f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2453f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2454f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2455f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2456f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2457f26ec98cSKris Buschelman     idt  = 4*i;
2458f26ec98cSKris Buschelman     s1 = t[idt];
2459f26ec98cSKris Buschelman     s2 = t[1+idt];
2460f26ec98cSKris Buschelman     s3 = t[2+idt];
2461f26ec98cSKris Buschelman     s4 = t[3+idt];
2462f26ec98cSKris Buschelman     while (nz--) {
2463f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2464f26ec98cSKris Buschelman       x1  = t[idx];
2465f26ec98cSKris Buschelman       x2  = t[1+idx];
2466f26ec98cSKris Buschelman       x3  = t[2+idx];
2467f26ec98cSKris Buschelman       x4  = t[3+idx];
2468f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2469f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2470f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2471f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2472f26ec98cSKris Buschelman       v += 16;
2473f26ec98cSKris Buschelman     }
2474f26ec98cSKris Buschelman     idc      = 4*(*c--);
2475f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2476f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2477f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2478f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2479f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2480f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2481f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2482f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2483f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2484f26ec98cSKris Buschelman  }
2485f26ec98cSKris Buschelman 
2486f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2487f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2488d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2490dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2491f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2492f26ec98cSKris Buschelman }
2493f26ec98cSKris Buschelman 
249424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
249524c233c2SKris Buschelman 
249624c233c2SKris Buschelman #include PETSC_HAVE_SSE
249724c233c2SKris Buschelman 
249824c233c2SKris Buschelman #undef __FUNCT__
249924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2500dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
250124c233c2SKris Buschelman {
250224c233c2SKris Buschelman   /*
250324c233c2SKris Buschelman      Note: This code uses demotion of double
250424c233c2SKris Buschelman      to float when performing the mixed-mode computation.
250524c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
250624c233c2SKris Buschelman   */
250724c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
250824c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
25096849ba73SBarry Smith   PetscErrorCode ierr;
25105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
25115d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
251224c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
251387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
251424c233c2SKris Buschelman 
251524c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
251624c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
251724c233c2SKris Buschelman   unsigned long   offset;
251824c233c2SKris Buschelman 
251924c233c2SKris Buschelman   PetscFunctionBegin;
252024c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
252124c233c2SKris Buschelman 
252224c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
252324c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
252424c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
252524c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
252624c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
252724c233c2SKris Buschelman 
25281ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25291ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
253024c233c2SKris Buschelman     t  = a->solve_work;
253124c233c2SKris Buschelman 
253224c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
253324c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
253424c233c2SKris Buschelman 
253524c233c2SKris Buschelman     /* forward solve the lower triangular */
253624c233c2SKris Buschelman     idx  = 4*(*r++);
253724c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
253824c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
253924c233c2SKris Buschelman     v    =  aa + 16*ai[1];
254024c233c2SKris Buschelman 
254124c233c2SKris Buschelman     for (i=1; i<n;) {
254224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
254324c233c2SKris Buschelman       vi   =  aj      + ai[i];
254424c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
254524c233c2SKris Buschelman       idx  =  4*(*r++);
254624c233c2SKris Buschelman 
254724c233c2SKris Buschelman       /* Demote sum from double to float */
254824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
254924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
255024c233c2SKris Buschelman 
255124c233c2SKris Buschelman       while (nz--) {
255224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
255324c233c2SKris Buschelman         idx = 4*(*vi++);
255424c233c2SKris Buschelman 
255524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
255624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
255724c233c2SKris Buschelman 
255824c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
255924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
256024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
256124c233c2SKris Buschelman 
256224c233c2SKris Buschelman           /* First Column */
256324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
256424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
256524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
256624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
256724c233c2SKris Buschelman 
256824c233c2SKris Buschelman           /* Second Column */
256924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
257024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
257124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
257224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
257324c233c2SKris Buschelman 
257424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
257524c233c2SKris Buschelman 
257624c233c2SKris Buschelman           /* Third Column */
257724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
257824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
257924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
258024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
258124c233c2SKris Buschelman 
258224c233c2SKris Buschelman           /* Fourth Column */
258324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
258424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
258524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
258624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
258724c233c2SKris Buschelman         SSE_INLINE_END_2
258824c233c2SKris Buschelman 
258924c233c2SKris Buschelman         v  += 16;
259024c233c2SKris Buschelman       }
259124c233c2SKris Buschelman       idx = 4*i;
259224c233c2SKris Buschelman       v   = aa + 16*ai[++i];
259324c233c2SKris Buschelman       PREFETCH_NTA(v);
259424c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
259524c233c2SKris Buschelman 
259624c233c2SKris Buschelman       /* Promote result from float to double */
259724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
259824c233c2SKris Buschelman     }
259924c233c2SKris Buschelman     /* backward solve the upper triangular */
260024c233c2SKris Buschelman     idt  = 4*(n-1);
260124c233c2SKris Buschelman     ai16 = 16*diag[n-1];
260224c233c2SKris Buschelman     v    = aa + ai16 + 16;
260324c233c2SKris Buschelman     for (i=n-1; i>=0;){
260424c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
260524c233c2SKris Buschelman       vi = aj + diag[i] + 1;
260624c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
260724c233c2SKris Buschelman 
260824c233c2SKris Buschelman       /* Demote accumulator from double to float */
260924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
261024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
261124c233c2SKris Buschelman 
261224c233c2SKris Buschelman       while (nz--) {
261324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
261424c233c2SKris Buschelman         idx = 4*(*vi++);
261524c233c2SKris Buschelman 
261624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
261724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
261824c233c2SKris Buschelman 
261924c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
262024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
262124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
262224c233c2SKris Buschelman 
262324c233c2SKris Buschelman           /* First Column */
262424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
262524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
262624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
262724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
262824c233c2SKris Buschelman 
262924c233c2SKris Buschelman           /* Second Column */
263024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
263124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
263224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
263324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
263424c233c2SKris Buschelman 
263524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
263624c233c2SKris Buschelman 
263724c233c2SKris Buschelman           /* Third Column */
263824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
263924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
264024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
264124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
264224c233c2SKris Buschelman 
264324c233c2SKris Buschelman           /* Fourth Column */
264424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
264524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
264624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
264724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
264824c233c2SKris Buschelman         SSE_INLINE_END_2
264924c233c2SKris Buschelman         v  += 16;
265024c233c2SKris Buschelman       }
265124c233c2SKris Buschelman       v    = aa + ai16;
265224c233c2SKris Buschelman       ai16 = 16*diag[--i];
265324c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
265424c233c2SKris Buschelman       /*
265524c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
265624c233c2SKris Buschelman          which was inverted as part of the factorization
265724c233c2SKris Buschelman       */
265824c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
265924c233c2SKris Buschelman         /* First Column */
266024c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
266124c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
266224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
266324c233c2SKris Buschelman 
266424c233c2SKris Buschelman         /* Second Column */
266524c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
266624c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
266724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
266824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
266924c233c2SKris Buschelman 
267024c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
267124c233c2SKris Buschelman 
267224c233c2SKris Buschelman         /* Third Column */
267324c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
267424c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
267524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
267624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
267724c233c2SKris Buschelman 
267824c233c2SKris Buschelman         /* Fourth Column */
267924c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
268024c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
268124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
268224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
268324c233c2SKris Buschelman 
268424c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
268524c233c2SKris Buschelman       SSE_INLINE_END_3
268624c233c2SKris Buschelman 
268724c233c2SKris Buschelman       /* Promote solution from float to double */
268824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
268924c233c2SKris Buschelman 
269024c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
269124c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
269224c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
269324c233c2SKris Buschelman       idc  = 4*(*c--);
269424c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
269524c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
269624c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
269724c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
269824c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
269924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
270024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
270124c233c2SKris Buschelman       SSE_INLINE_END_2
270224c233c2SKris Buschelman       v    = aa + ai16 + 16;
270324c233c2SKris Buschelman       idt -= 4;
270424c233c2SKris Buschelman     }
270524c233c2SKris Buschelman 
270624c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
270724c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27081ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
27091ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2710dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
271124c233c2SKris Buschelman   SSE_SCOPE_END;
271224c233c2SKris Buschelman   PetscFunctionReturn(0);
271324c233c2SKris Buschelman }
271424c233c2SKris Buschelman 
271524c233c2SKris Buschelman #endif
27160ef38995SBarry Smith 
27170ef38995SBarry Smith 
27184e2b4712SSatish Balay /*
27194e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
27204e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
27214e2b4712SSatish Balay */
27224a2ae208SSatish Balay #undef __FUNCT__
27234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2724dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
27254e2b4712SSatish Balay {
27264e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2727356650c2SBarry Smith   PetscInt          n=a->mbs;
2728356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2729dfbe8321SBarry Smith   PetscErrorCode    ierr;
2730356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2731d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2732d9fead3dSBarry Smith   PetscScalar       *x;
2733d9fead3dSBarry Smith   const PetscScalar *b;
27344e2b4712SSatish Balay 
27354e2b4712SSatish Balay   PetscFunctionBegin;
2736d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27384e2b4712SSatish Balay 
2739aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
27402853dc0eSBarry Smith   {
274187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
27422853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
27432853dc0eSBarry Smith   }
2744aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
27452853dc0eSBarry Smith   {
274687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
27472853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
27482853dc0eSBarry Smith   }
2749aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
27502853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2751e1293385SBarry Smith #else
275230d4dcafSBarry Smith   {
275387828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2754d9fead3dSBarry Smith     const MatScalar *v;
2755356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2756356650c2SBarry Smith     const PetscInt  *vi;
2757e1293385SBarry Smith 
27584e2b4712SSatish Balay   /* forward solve the lower triangular */
27594e2b4712SSatish Balay   idx    = 0;
2760e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
27614e2b4712SSatish Balay   for (i=1; i<n; i++) {
27624e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
27634e2b4712SSatish Balay     vi    =  aj      + ai[i];
27644e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2765e1293385SBarry Smith     idx   +=  4;
2766f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
27674e2b4712SSatish Balay     while (nz--) {
27684e2b4712SSatish Balay       jdx   = 4*(*vi++);
27694e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2770f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2771f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2772f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2773f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
27744e2b4712SSatish Balay       v    += 16;
27754e2b4712SSatish Balay     }
2776f1af5d2fSBarry Smith     x[idx]   = s1;
2777f1af5d2fSBarry Smith     x[1+idx] = s2;
2778f1af5d2fSBarry Smith     x[2+idx] = s3;
2779f1af5d2fSBarry Smith     x[3+idx] = s4;
27804e2b4712SSatish Balay   }
27814e2b4712SSatish Balay   /* backward solve the upper triangular */
27824e555682SBarry Smith   idt = 4*(n-1);
27834e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27844e555682SBarry Smith     ai16 = 16*diag[i];
27854e555682SBarry Smith     v    = aa + ai16 + 16;
27864e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27874e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2788f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2789f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
27904e2b4712SSatish Balay     while (nz--) {
27914e2b4712SSatish Balay       idx   = 4*(*vi++);
27924e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2793f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2794f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2795f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2796f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
27974e2b4712SSatish Balay       v    += 16;
27984e2b4712SSatish Balay     }
27994e555682SBarry Smith     v        = aa + ai16;
2800f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2801f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2802f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2803f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2804329f5518SBarry Smith     idt -= 4;
28054e2b4712SSatish Balay   }
280630d4dcafSBarry Smith   }
2807e1293385SBarry Smith #endif
28084e2b4712SSatish Balay 
2809d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2811dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28124e2b4712SSatish Balay   PetscFunctionReturn(0);
28134e2b4712SSatish Balay }
28144e2b4712SSatish Balay 
2815f26ec98cSKris Buschelman #undef __FUNCT__
2816cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2817cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2818cee9d6f2SShri Abhyankar {
2819cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
28206464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2821cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2822cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2823cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2824cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2825cee9d6f2SShri Abhyankar     PetscScalar       *x;
2826cee9d6f2SShri Abhyankar     const PetscScalar *b;
2827cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2828cee9d6f2SShri Abhyankar 
2829cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2830cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2831cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2832cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
2833cee9d6f2SShri Abhyankar     idx    = 0;
2834cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2835cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2836cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2837cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2838cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
2839cee9d6f2SShri Abhyankar       idx   = bs*i;
2840cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
28416464896eSShri Abhyankar       for(k=0;k<nz;k++) {
28426464896eSShri Abhyankar           jdx   = bs*vi[k];
2843cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2844cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2845cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2846cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2847cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2848cee9d6f2SShri Abhyankar 
2849cee9d6f2SShri Abhyankar           v   +=  bs2;
2850cee9d6f2SShri Abhyankar         }
2851cee9d6f2SShri Abhyankar 
2852cee9d6f2SShri Abhyankar        x[idx]   = s1;
2853cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2854cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2855cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2856cee9d6f2SShri Abhyankar     }
2857cee9d6f2SShri Abhyankar 
2858cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
2859cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2860cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2861cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2862cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2863cee9d6f2SShri Abhyankar      idt = bs*i;
2864cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2865cee9d6f2SShri Abhyankar 
28666464896eSShri Abhyankar     for(k=0;k<nz;k++){
28676464896eSShri Abhyankar       idx   = bs*vi[k];
2868cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2869cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2870cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2871cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2872cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2873cee9d6f2SShri Abhyankar 
2874cee9d6f2SShri Abhyankar         v   +=  bs2;
2875cee9d6f2SShri Abhyankar     }
2876cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2877cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2878cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2879cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2880cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2881cee9d6f2SShri Abhyankar 
2882cee9d6f2SShri Abhyankar   }
2883cee9d6f2SShri Abhyankar 
2884cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2885cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2886cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2887cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2888cee9d6f2SShri Abhyankar }
2889cee9d6f2SShri Abhyankar 
2890*b2b2dd24SShri Abhyankar #undef __FUNCT__
2891*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
2892*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2893*b2b2dd24SShri Abhyankar {
2894*b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2895*b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2896*b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
2897*b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
2898*b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2899*b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2900*b2b2dd24SShri Abhyankar     PetscScalar       *x;
2901*b2b2dd24SShri Abhyankar     const PetscScalar *b;
2902*b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2903cee9d6f2SShri Abhyankar 
2904*b2b2dd24SShri Abhyankar     PetscFunctionBegin;
2905*b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2906*b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2907*b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
2908*b2b2dd24SShri Abhyankar     idx    = 0;
2909*b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2910*b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
2911*b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
2912*b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
2913*b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
2914*b2b2dd24SShri Abhyankar       idx   = bs*i;
2915*b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2916*b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
2917*b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
2918*b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2919*b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2920*b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2921*b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2922*b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2923*b2b2dd24SShri Abhyankar 
2924*b2b2dd24SShri Abhyankar           v   +=  bs2;
2925*b2b2dd24SShri Abhyankar         }
2926*b2b2dd24SShri Abhyankar 
2927*b2b2dd24SShri Abhyankar        x[idx]   = s1;
2928*b2b2dd24SShri Abhyankar        x[1+idx] = s2;
2929*b2b2dd24SShri Abhyankar        x[2+idx] = s3;
2930*b2b2dd24SShri Abhyankar        x[3+idx] = s4;
2931*b2b2dd24SShri Abhyankar     }
2932*b2b2dd24SShri Abhyankar 
2933*b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
2934*b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
2935*b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
2936*b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
2937*b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
2938*b2b2dd24SShri Abhyankar      idt = bs*i;
2939*b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2940*b2b2dd24SShri Abhyankar 
2941*b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
2942*b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
2943*b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2944*b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2945*b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2946*b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2947*b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2948*b2b2dd24SShri Abhyankar 
2949*b2b2dd24SShri Abhyankar         v   +=  bs2;
2950*b2b2dd24SShri Abhyankar     }
2951*b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
2952*b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2953*b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2954*b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2955*b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2956*b2b2dd24SShri Abhyankar 
2957*b2b2dd24SShri Abhyankar   }
2958*b2b2dd24SShri Abhyankar 
2959*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2960*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2961*b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2962*b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
2963*b2b2dd24SShri Abhyankar }
2964cee9d6f2SShri Abhyankar 
2965cee9d6f2SShri Abhyankar #undef __FUNCT__
2966f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2967dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2968f26ec98cSKris Buschelman {
2969f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2970690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2971dfbe8321SBarry Smith   PetscErrorCode ierr;
2972690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2973f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2974f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2975f26ec98cSKris Buschelman 
2976f26ec98cSKris Buschelman   PetscFunctionBegin;
29771ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
29781ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2979f26ec98cSKris Buschelman 
2980f26ec98cSKris Buschelman   {
2981f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2982f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2983690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2984f26ec98cSKris Buschelman 
2985f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2986f26ec98cSKris Buschelman     idx  = 0;
2987f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2988f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2989f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2990f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2991f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2992f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2993f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2994f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2995f26ec98cSKris Buschelman       idx   +=  4;
2996f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2997f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2998f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2999f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3000f26ec98cSKris Buschelman       while (nz--) {
3001f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3002f26ec98cSKris Buschelman         x1  = t[jdx];
3003f26ec98cSKris Buschelman         x2  = t[1+jdx];
3004f26ec98cSKris Buschelman         x3  = t[2+jdx];
3005f26ec98cSKris Buschelman         x4  = t[3+jdx];
3006f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3007f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3008f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3009f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3010f26ec98cSKris Buschelman         v    += 16;
3011f26ec98cSKris Buschelman       }
3012f26ec98cSKris Buschelman       t[idx]   = s1;
3013f26ec98cSKris Buschelman       t[1+idx] = s2;
3014f26ec98cSKris Buschelman       t[2+idx] = s3;
3015f26ec98cSKris Buschelman       t[3+idx] = s4;
3016f26ec98cSKris Buschelman     }
3017f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3018f26ec98cSKris Buschelman     idt = 4*(n-1);
3019f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3020f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3021f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3022f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3023f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3024f26ec98cSKris Buschelman       s1   = t[idt];
3025f26ec98cSKris Buschelman       s2   = t[1+idt];
3026f26ec98cSKris Buschelman       s3   = t[2+idt];
3027f26ec98cSKris Buschelman       s4   = t[3+idt];
3028f26ec98cSKris Buschelman       while (nz--) {
3029f26ec98cSKris Buschelman         idx = 4*(*vi++);
3030f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3031f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3032f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3033f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3034f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3035f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3036f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3037f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3038f26ec98cSKris Buschelman         v    += 16;
3039f26ec98cSKris Buschelman       }
3040f26ec98cSKris Buschelman       v        = aa + ai16;
3041f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3042f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3043f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3044f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3045f26ec98cSKris Buschelman       idt -= 4;
3046f26ec98cSKris Buschelman     }
3047f26ec98cSKris Buschelman   }
3048f26ec98cSKris Buschelman 
30491ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3051dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3052f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3053f26ec98cSKris Buschelman }
3054f26ec98cSKris Buschelman 
30553660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
30563660e330SKris Buschelman 
30573660e330SKris Buschelman #include PETSC_HAVE_SSE
30583660e330SKris Buschelman #undef __FUNCT__
30597cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3060dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
30613660e330SKris Buschelman {
30623660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
30632aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3064dfbe8321SBarry Smith   PetscErrorCode ierr;
3065dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
30663660e330SKris Buschelman   MatScalar      *aa=a->a;
306787828ca2SBarry Smith   PetscScalar    *x,*b;
30683660e330SKris Buschelman 
30693660e330SKris Buschelman   PetscFunctionBegin;
30703660e330SKris Buschelman   SSE_SCOPE_BEGIN;
30713660e330SKris Buschelman   /*
30723660e330SKris Buschelman      Note: This code currently uses demotion of double
30733660e330SKris Buschelman      to float when performing the mixed-mode computation.
30743660e330SKris Buschelman      This may not be numerically reasonable for all applications.
30753660e330SKris Buschelman   */
30763660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
30773660e330SKris Buschelman 
30781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
30803660e330SKris Buschelman   {
3081eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3082eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
30832aa5897fSKris Buschelman     int            nz,i,idt,ai16;
30842aa5897fSKris Buschelman     unsigned int   jdx,idx;
30852aa5897fSKris Buschelman     unsigned short *vi;
3086eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
30873660e330SKris Buschelman 
3088eb05f457SKris Buschelman     /* First block is the identity. */
30893660e330SKris Buschelman     idx  = 0;
3090eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
30912aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
30923660e330SKris Buschelman 
30933660e330SKris Buschelman     for (i=1; i<n;) {
30943660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
30953660e330SKris Buschelman       vi   =  aj      + ai[i];
30963660e330SKris Buschelman       nz   =  diag[i] - ai[i];
30973660e330SKris Buschelman       idx +=  4;
30983660e330SKris Buschelman 
3099eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3100eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3101eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
31023660e330SKris Buschelman 
31033660e330SKris Buschelman       while (nz--) {
31043660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
31052aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
31063660e330SKris Buschelman 
31073660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3108eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
31093660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
31103660e330SKris Buschelman 
31113660e330SKris Buschelman           /* First Column */
31123660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
31133660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
31143660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
31153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
31163660e330SKris Buschelman 
31173660e330SKris Buschelman           /* Second Column */
31183660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
31193660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
31203660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
31213660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
31223660e330SKris Buschelman 
31233660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
31243660e330SKris Buschelman 
31253660e330SKris Buschelman           /* Third Column */
31263660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
31273660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
31283660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
31293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
31303660e330SKris Buschelman 
31313660e330SKris Buschelman           /* Fourth Column */
31323660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
31333660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
31343660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
31353660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
31363660e330SKris Buschelman         SSE_INLINE_END_2
31373660e330SKris Buschelman 
31383660e330SKris Buschelman         v  += 16;
31393660e330SKris Buschelman       }
31403660e330SKris Buschelman       v    =  aa + 16*ai[++i];
31413660e330SKris Buschelman       PREFETCH_NTA(v);
3142eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
31433660e330SKris Buschelman     }
3144eb05f457SKris Buschelman 
3145eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3146eb05f457SKris Buschelman 
31473660e330SKris Buschelman     idt  = 4*(n-1);
31483660e330SKris Buschelman     ai16 = 16*diag[n-1];
31493660e330SKris Buschelman     v    = aa + ai16 + 16;
31503660e330SKris Buschelman     for (i=n-1; i>=0;){
31513660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
31523660e330SKris Buschelman       vi = aj + diag[i] + 1;
31533660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
31543660e330SKris Buschelman 
3155eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
31563660e330SKris Buschelman 
31573660e330SKris Buschelman       while (nz--) {
31583660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
31592aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
31603660e330SKris Buschelman 
31613660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3162eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
31633660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
31643660e330SKris Buschelman 
31653660e330SKris Buschelman           /* First Column */
31663660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
31673660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
31683660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
31693660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
31703660e330SKris Buschelman 
31713660e330SKris Buschelman           /* Second Column */
31723660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
31733660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
31743660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
31753660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
31763660e330SKris Buschelman 
31773660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
31783660e330SKris Buschelman 
31793660e330SKris Buschelman           /* Third Column */
31803660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
31813660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
31823660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
31833660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
31843660e330SKris Buschelman 
31853660e330SKris Buschelman           /* Fourth Column */
31863660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
31873660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
31883660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
31893660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
31903660e330SKris Buschelman         SSE_INLINE_END_2
31913660e330SKris Buschelman         v  += 16;
31923660e330SKris Buschelman       }
31933660e330SKris Buschelman       v    = aa + ai16;
31943660e330SKris Buschelman       ai16 = 16*diag[--i];
31953660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
31963660e330SKris Buschelman       /*
31973660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
31983660e330SKris Buschelman          which was inverted as part of the factorization
31993660e330SKris Buschelman       */
3200eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
32013660e330SKris Buschelman         /* First Column */
32023660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
32033660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
32043660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
32053660e330SKris Buschelman 
32063660e330SKris Buschelman         /* Second Column */
32073660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
32083660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
32093660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
32103660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
32113660e330SKris Buschelman 
32123660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
32133660e330SKris Buschelman 
32143660e330SKris Buschelman         /* Third Column */
32153660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
32163660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
32173660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
32183660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
32193660e330SKris Buschelman 
32203660e330SKris Buschelman         /* Fourth Column */
32213660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
32223660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
32233660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
32243660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
32253660e330SKris Buschelman 
32263660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
32273660e330SKris Buschelman       SSE_INLINE_END_3
32283660e330SKris Buschelman 
32293660e330SKris Buschelman       v    = aa + ai16 + 16;
32303660e330SKris Buschelman       idt -= 4;
32313660e330SKris Buschelman     }
3232eb05f457SKris Buschelman 
3233eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3234eb05f457SKris Buschelman     idt = 4*(n-1);
3235eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3236eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3237eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3238eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3239eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3240eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3241eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3242eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3243eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
324454693613SKris Buschelman       idt -= 4;
32453660e330SKris Buschelman     }
3246eb05f457SKris Buschelman 
3247eb05f457SKris Buschelman   } /* End of artificial scope. */
32481ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3250dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
32513660e330SKris Buschelman   SSE_SCOPE_END;
32523660e330SKris Buschelman   PetscFunctionReturn(0);
32533660e330SKris Buschelman }
32543660e330SKris Buschelman 
32557cf1b8d3SKris Buschelman #undef __FUNCT__
32567cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3257dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
32587cf1b8d3SKris Buschelman {
32597cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
32607cf1b8d3SKris Buschelman   int            *aj=a->j;
3261dfbe8321SBarry Smith   PetscErrorCode ierr;
3262dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
32637cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
32647cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
32657cf1b8d3SKris Buschelman 
32667cf1b8d3SKris Buschelman   PetscFunctionBegin;
32677cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
32687cf1b8d3SKris Buschelman   /*
32697cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
32707cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
32717cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
32727cf1b8d3SKris Buschelman   */
32737cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
32747cf1b8d3SKris Buschelman 
32751ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
32761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
32777cf1b8d3SKris Buschelman   {
32787cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
32797cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
32807cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
32817cf1b8d3SKris Buschelman     int       jdx,idx;
32827cf1b8d3SKris Buschelman     int       *vi;
32837cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
32847cf1b8d3SKris Buschelman 
32857cf1b8d3SKris Buschelman     /* First block is the identity. */
32867cf1b8d3SKris Buschelman     idx  = 0;
32877cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
32887cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
32897cf1b8d3SKris Buschelman 
32907cf1b8d3SKris Buschelman     for (i=1; i<n;) {
32917cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
32927cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
32937cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
32947cf1b8d3SKris Buschelman       idx +=  4;
32957cf1b8d3SKris Buschelman 
32967cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
32977cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
32987cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
32997cf1b8d3SKris Buschelman 
33007cf1b8d3SKris Buschelman       while (nz--) {
33017cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
33027cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
33037cf1b8d3SKris Buschelman /*          jdx = *vi++; */
33047cf1b8d3SKris Buschelman 
33057cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
33067cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
33077cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
33087cf1b8d3SKris Buschelman 
33097cf1b8d3SKris Buschelman           /* First Column */
33107cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
33117cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
33127cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
33137cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
33147cf1b8d3SKris Buschelman 
33157cf1b8d3SKris Buschelman           /* Second Column */
33167cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
33177cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
33187cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
33197cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
33207cf1b8d3SKris Buschelman 
33217cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
33227cf1b8d3SKris Buschelman 
33237cf1b8d3SKris Buschelman           /* Third Column */
33247cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
33257cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
33267cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
33277cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
33287cf1b8d3SKris Buschelman 
33297cf1b8d3SKris Buschelman           /* Fourth Column */
33307cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
33317cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
33327cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
33337cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
33347cf1b8d3SKris Buschelman         SSE_INLINE_END_2
33357cf1b8d3SKris Buschelman 
33367cf1b8d3SKris Buschelman         v  += 16;
33377cf1b8d3SKris Buschelman       }
33387cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
33397cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
33407cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
33417cf1b8d3SKris Buschelman     }
33427cf1b8d3SKris Buschelman 
33437cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
33447cf1b8d3SKris Buschelman 
33457cf1b8d3SKris Buschelman     idt  = 4*(n-1);
33467cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
33477cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
33487cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
33497cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
33507cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
33517cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
33527cf1b8d3SKris Buschelman 
33537cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
33547cf1b8d3SKris Buschelman 
33557cf1b8d3SKris Buschelman       while (nz--) {
33567cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
33577cf1b8d3SKris Buschelman         idx = 4*(*vi++);
33587cf1b8d3SKris Buschelman /*          idx = *vi++; */
33597cf1b8d3SKris Buschelman 
33607cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
33617cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
33627cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
33637cf1b8d3SKris Buschelman 
33647cf1b8d3SKris Buschelman           /* First Column */
33657cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
33667cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
33677cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
33687cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
33697cf1b8d3SKris Buschelman 
33707cf1b8d3SKris Buschelman           /* Second Column */
33717cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
33727cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
33737cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
33747cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
33757cf1b8d3SKris Buschelman 
33767cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
33777cf1b8d3SKris Buschelman 
33787cf1b8d3SKris Buschelman           /* Third Column */
33797cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
33807cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
33817cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
33827cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
33837cf1b8d3SKris Buschelman 
33847cf1b8d3SKris Buschelman           /* Fourth Column */
33857cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
33867cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
33877cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
33887cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
33897cf1b8d3SKris Buschelman         SSE_INLINE_END_2
33907cf1b8d3SKris Buschelman         v  += 16;
33917cf1b8d3SKris Buschelman       }
33927cf1b8d3SKris Buschelman       v    = aa + ai16;
33937cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
33947cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
33957cf1b8d3SKris Buschelman       /*
33967cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
33977cf1b8d3SKris Buschelman          which was inverted as part of the factorization
33987cf1b8d3SKris Buschelman       */
33997cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
34007cf1b8d3SKris Buschelman         /* First Column */
34017cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
34027cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
34037cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
34047cf1b8d3SKris Buschelman 
34057cf1b8d3SKris Buschelman         /* Second Column */
34067cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
34077cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
34087cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
34097cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
34107cf1b8d3SKris Buschelman 
34117cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
34127cf1b8d3SKris Buschelman 
34137cf1b8d3SKris Buschelman         /* Third Column */
34147cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
34157cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
34167cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
34177cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
34187cf1b8d3SKris Buschelman 
34197cf1b8d3SKris Buschelman         /* Fourth Column */
34207cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
34217cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
34227cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
34237cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
34247cf1b8d3SKris Buschelman 
34257cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
34267cf1b8d3SKris Buschelman       SSE_INLINE_END_3
34277cf1b8d3SKris Buschelman 
34287cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
34297cf1b8d3SKris Buschelman       idt -= 4;
34307cf1b8d3SKris Buschelman     }
34317cf1b8d3SKris Buschelman 
34327cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
34337cf1b8d3SKris Buschelman     idt = 4*(n-1);
34347cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
34357cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
34367cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
34377cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
34387cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
34397cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
34407cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
34417cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
34427cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
34437cf1b8d3SKris Buschelman       idt -= 4;
34447cf1b8d3SKris Buschelman     }
34457cf1b8d3SKris Buschelman 
34467cf1b8d3SKris Buschelman   } /* End of artificial scope. */
34471ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3449dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
34507cf1b8d3SKris Buschelman   SSE_SCOPE_END;
34517cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
34527cf1b8d3SKris Buschelman }
34537cf1b8d3SKris Buschelman 
34543660e330SKris Buschelman #endif
34558f690400SShri Abhyankar 
34564a2ae208SSatish Balay #undef __FUNCT__
34574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3458dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
34594e2b4712SSatish Balay {
34604e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
34614e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
34626849ba73SBarry Smith   PetscErrorCode    ierr;
34635d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
34645d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3465d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3466d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3467d9fead3dSBarry Smith   const PetscScalar *b;
34684e2b4712SSatish Balay 
34694e2b4712SSatish Balay   PetscFunctionBegin;
3470d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3472f1af5d2fSBarry Smith   t  = a->solve_work;
34734e2b4712SSatish Balay 
34744e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
34754e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
34764e2b4712SSatish Balay 
34774e2b4712SSatish Balay   /* forward solve the lower triangular */
34784e2b4712SSatish Balay   idx    = 3*(*r++);
3479f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
34804e2b4712SSatish Balay   for (i=1; i<n; i++) {
34814e2b4712SSatish Balay     v     = aa + 9*ai[i];
34824e2b4712SSatish Balay     vi    = aj + ai[i];
34834e2b4712SSatish Balay     nz    = diag[i] - ai[i];
34844e2b4712SSatish Balay     idx   = 3*(*r++);
3485f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
34864e2b4712SSatish Balay     while (nz--) {
34874e2b4712SSatish Balay       idx   = 3*(*vi++);
3488f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3489f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3490f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3491f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
34924e2b4712SSatish Balay       v += 9;
34934e2b4712SSatish Balay     }
34944e2b4712SSatish Balay     idx = 3*i;
3495f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
34964e2b4712SSatish Balay   }
34974e2b4712SSatish Balay   /* backward solve the upper triangular */
34984e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
34994e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
35004e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
35014e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
35024e2b4712SSatish Balay     idt  = 3*i;
3503f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
35044e2b4712SSatish Balay     while (nz--) {
35054e2b4712SSatish Balay       idx   = 3*(*vi++);
3506f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3507f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3508f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3509f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
35104e2b4712SSatish Balay       v += 9;
35114e2b4712SSatish Balay     }
35124e2b4712SSatish Balay     idc = 3*(*c--);
35134e2b4712SSatish Balay     v   = aa + 9*diag[i];
3514f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3515f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3516f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
35174e2b4712SSatish Balay   }
35184e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
35194e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3520d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35211ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3522dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
35234e2b4712SSatish Balay   PetscFunctionReturn(0);
35244e2b4712SSatish Balay }
35254e2b4712SSatish Balay 
35268f690400SShri Abhyankar #undef __FUNCT__
35278f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
35288f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
35298f690400SShri Abhyankar {
35308f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
35318f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
35328f690400SShri Abhyankar   PetscErrorCode    ierr;
353329b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
35348f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
35358f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
35368f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
35378f690400SShri Abhyankar   const PetscScalar *b;
35388f690400SShri Abhyankar 
35398f690400SShri Abhyankar   PetscFunctionBegin;
35408f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35418f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
35428f690400SShri Abhyankar   t  = a->solve_work;
35438f690400SShri Abhyankar 
35448f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
354529b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
35468f690400SShri Abhyankar 
35478f690400SShri Abhyankar   /* forward solve the lower triangular */
354829b92fc1SShri Abhyankar   idx    = 3*r[0];
35498f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
35508f690400SShri Abhyankar   for (i=1; i<n; i++) {
35518f690400SShri Abhyankar     v     = aa + 9*ai[i];
35528f690400SShri Abhyankar     vi    = aj + ai[i];
35538f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
355429b92fc1SShri Abhyankar     idx   = 3*r[i];
35558f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
355629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
355729b92fc1SShri Abhyankar       idx   = 3*vi[m];
35588f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
35598f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
35608f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
35618f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
35628f690400SShri Abhyankar       v += 9;
35638f690400SShri Abhyankar     }
35648f690400SShri Abhyankar     idx = 3*i;
35658f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
35668f690400SShri Abhyankar   }
35678f690400SShri Abhyankar   /* backward solve the upper triangular */
35688f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
35698f690400SShri Abhyankar     k    = 2*n-i;
35708f690400SShri Abhyankar     v    = aa + 9*ai[k];
35718f690400SShri Abhyankar     vi   = aj + ai[k];
35728f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
35738f690400SShri Abhyankar     idt  = 3*i;
35748f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
357529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
357629b92fc1SShri Abhyankar       idx   = 3*vi[m];
35778f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
35788f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
35798f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
35808f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
35818f690400SShri Abhyankar       v += 9;
35828f690400SShri Abhyankar     }
358329b92fc1SShri Abhyankar     idc = 3*c[i];
35848f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
35858f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
35868f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
35878f690400SShri Abhyankar   }
35888f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
35898f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
35908f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35918f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
35928f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
35938f690400SShri Abhyankar   PetscFunctionReturn(0);
35948f690400SShri Abhyankar }
35958f690400SShri Abhyankar 
359615091d37SBarry Smith /*
359715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
359815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
359915091d37SBarry Smith */
36004a2ae208SSatish Balay #undef __FUNCT__
36014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3602dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
360315091d37SBarry Smith {
360415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3605690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3606dfbe8321SBarry Smith   PetscErrorCode    ierr;
3607690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3608d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3609d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3610d9fead3dSBarry Smith   const PetscScalar *b;
3611690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
361215091d37SBarry Smith 
361315091d37SBarry Smith   PetscFunctionBegin;
3614d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
36151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
361615091d37SBarry Smith 
361715091d37SBarry Smith   /* forward solve the lower triangular */
361815091d37SBarry Smith   idx    = 0;
361915091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
362015091d37SBarry Smith   for (i=1; i<n; i++) {
362115091d37SBarry Smith     v     =  aa      + 9*ai[i];
362215091d37SBarry Smith     vi    =  aj      + ai[i];
362315091d37SBarry Smith     nz    =  diag[i] - ai[i];
362415091d37SBarry Smith     idx   +=  3;
3625f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
362615091d37SBarry Smith     while (nz--) {
362715091d37SBarry Smith       jdx   = 3*(*vi++);
362815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3629f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3630f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3631f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
363215091d37SBarry Smith       v    += 9;
363315091d37SBarry Smith     }
3634f1af5d2fSBarry Smith     x[idx]   = s1;
3635f1af5d2fSBarry Smith     x[1+idx] = s2;
3636f1af5d2fSBarry Smith     x[2+idx] = s3;
363715091d37SBarry Smith   }
363815091d37SBarry Smith   /* backward solve the upper triangular */
363915091d37SBarry Smith   for (i=n-1; i>=0; i--){
364015091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
364115091d37SBarry Smith     vi   = aj + diag[i] + 1;
364215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
364315091d37SBarry Smith     idt  = 3*i;
3644f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3645f1af5d2fSBarry Smith     s3 = x[2+idt];
364615091d37SBarry Smith     while (nz--) {
364715091d37SBarry Smith       idx   = 3*(*vi++);
364815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3649f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3650f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3651f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
365215091d37SBarry Smith       v    += 9;
365315091d37SBarry Smith     }
365415091d37SBarry Smith     v        = aa +  9*diag[i];
3655f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3656f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3657f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
365815091d37SBarry Smith   }
365915091d37SBarry Smith 
3660d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
36611ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3662dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
366315091d37SBarry Smith   PetscFunctionReturn(0);
366415091d37SBarry Smith }
366515091d37SBarry Smith 
36664a2ae208SSatish Balay #undef __FUNCT__
3667cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3668cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3669cee9d6f2SShri Abhyankar {
3670cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3671ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3672cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3673cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3674cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3675cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3676cee9d6f2SShri Abhyankar     PetscScalar       *x;
3677cee9d6f2SShri Abhyankar     const PetscScalar *b;
3678cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3679cee9d6f2SShri Abhyankar 
3680cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3681cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3682cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3683cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3684cee9d6f2SShri Abhyankar     idx    = 0;
3685cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3686cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3687cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3688cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3689cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3690cee9d6f2SShri Abhyankar       idx   = bs*i;
3691cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3692ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
3693ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
3694cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3695cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3696cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3697cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3698cee9d6f2SShri Abhyankar 
3699cee9d6f2SShri Abhyankar           v   +=  bs2;
3700cee9d6f2SShri Abhyankar         }
3701cee9d6f2SShri Abhyankar 
3702cee9d6f2SShri Abhyankar        x[idx]   = s1;
3703cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3704cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3705cee9d6f2SShri Abhyankar     }
3706cee9d6f2SShri Abhyankar 
3707cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3708cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3709cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3710cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3711cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3712cee9d6f2SShri Abhyankar      idt = bs*i;
3713cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3714cee9d6f2SShri Abhyankar 
3715ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
3716ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
3717cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3718cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3719cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3720cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3721cee9d6f2SShri Abhyankar 
3722cee9d6f2SShri Abhyankar         v   +=  bs2;
3723cee9d6f2SShri Abhyankar     }
3724cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3725cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3726cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3727cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3728cee9d6f2SShri Abhyankar 
3729cee9d6f2SShri Abhyankar   }
3730cee9d6f2SShri Abhyankar 
3731cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3732cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3733cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3734cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3735cee9d6f2SShri Abhyankar }
3736cee9d6f2SShri Abhyankar 
3737cee9d6f2SShri Abhyankar #undef __FUNCT__
3738*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
3739*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3740*b2b2dd24SShri Abhyankar {
3741*b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3742*b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3743*b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3744*b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3745*b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3746*b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3747*b2b2dd24SShri Abhyankar     PetscScalar       *x;
3748*b2b2dd24SShri Abhyankar     const PetscScalar *b;
3749*b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3750*b2b2dd24SShri Abhyankar 
3751*b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3752*b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3753*b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3754*b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3755*b2b2dd24SShri Abhyankar     idx    = 0;
3756*b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3757*b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3758*b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3759*b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3760*b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3761*b2b2dd24SShri Abhyankar       idx   = bs*i;
3762*b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3763*b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
3764*b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
3765*b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3766*b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3767*b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3768*b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3769*b2b2dd24SShri Abhyankar 
3770*b2b2dd24SShri Abhyankar           v   +=  bs2;
3771*b2b2dd24SShri Abhyankar         }
3772*b2b2dd24SShri Abhyankar 
3773*b2b2dd24SShri Abhyankar        x[idx]   = s1;
3774*b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3775*b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3776*b2b2dd24SShri Abhyankar     }
3777*b2b2dd24SShri Abhyankar 
3778*b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3779*b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3780*b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3781*b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3782*b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3783*b2b2dd24SShri Abhyankar      idt = bs*i;
3784*b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3785*b2b2dd24SShri Abhyankar 
3786*b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
3787*b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
3788*b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3789*b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3790*b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3791*b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3792*b2b2dd24SShri Abhyankar 
3793*b2b2dd24SShri Abhyankar         v   +=  bs2;
3794*b2b2dd24SShri Abhyankar     }
3795*b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3796*b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3797*b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3798*b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3799*b2b2dd24SShri Abhyankar 
3800*b2b2dd24SShri Abhyankar   }
3801*b2b2dd24SShri Abhyankar 
3802*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3803*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3804*b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3805*b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3806*b2b2dd24SShri Abhyankar }
3807*b2b2dd24SShri Abhyankar 
3808*b2b2dd24SShri Abhyankar #undef __FUNCT__
38094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3810dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
38114e2b4712SSatish Balay {
38124e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
38134e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
38146849ba73SBarry Smith   PetscErrorCode    ierr;
38155d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
38165d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3817d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3818d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
3819d9fead3dSBarry Smith   const PetscScalar *b;
38204e2b4712SSatish Balay 
38214e2b4712SSatish Balay   PetscFunctionBegin;
3822d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3824f1af5d2fSBarry Smith   t  = a->solve_work;
38254e2b4712SSatish Balay 
38264e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
38274e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
38284e2b4712SSatish Balay 
38294e2b4712SSatish Balay   /* forward solve the lower triangular */
38304e2b4712SSatish Balay   idx    = 2*(*r++);
3831f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
38324e2b4712SSatish Balay   for (i=1; i<n; i++) {
38334e2b4712SSatish Balay     v     = aa + 4*ai[i];
38344e2b4712SSatish Balay     vi    = aj + ai[i];
38354e2b4712SSatish Balay     nz    = diag[i] - ai[i];
38364e2b4712SSatish Balay     idx   = 2*(*r++);
3837f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
38384e2b4712SSatish Balay     while (nz--) {
38394e2b4712SSatish Balay       idx   = 2*(*vi++);
3840f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3841f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3842f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
38434e2b4712SSatish Balay       v += 4;
38444e2b4712SSatish Balay     }
38454e2b4712SSatish Balay     idx = 2*i;
3846f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
38474e2b4712SSatish Balay   }
38484e2b4712SSatish Balay   /* backward solve the upper triangular */
38494e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
38504e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
38514e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
38524e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
38534e2b4712SSatish Balay     idt  = 2*i;
3854f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
38554e2b4712SSatish Balay     while (nz--) {
38564e2b4712SSatish Balay       idx   = 2*(*vi++);
3857f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3858f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3859f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
38604e2b4712SSatish Balay       v += 4;
38614e2b4712SSatish Balay     }
38624e2b4712SSatish Balay     idc = 2*(*c--);
38634e2b4712SSatish Balay     v   = aa + 4*diag[i];
3864f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3865f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
38664e2b4712SSatish Balay   }
38674e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
38684e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3869d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3871dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
38724e2b4712SSatish Balay   PetscFunctionReturn(0);
38734e2b4712SSatish Balay }
38744e2b4712SSatish Balay 
38758f690400SShri Abhyankar #undef __FUNCT__
38768f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
38778f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
38788f690400SShri Abhyankar {
38798f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
38808f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
38818f690400SShri Abhyankar   PetscErrorCode    ierr;
388229b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
38838f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
38848f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
38858f690400SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
38868f690400SShri Abhyankar   const PetscScalar *b;
38878f690400SShri Abhyankar 
38888f690400SShri Abhyankar   PetscFunctionBegin;
38898f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38908f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
38918f690400SShri Abhyankar   t  = a->solve_work;
38928f690400SShri Abhyankar 
38938f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
389429b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
38958f690400SShri Abhyankar 
38968f690400SShri Abhyankar   /* forward solve the lower triangular */
389729b92fc1SShri Abhyankar   idx    = 2*r[0];
38988f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
38998f690400SShri Abhyankar   for (i=1; i<n; i++) {
39008f690400SShri Abhyankar     v     = aa + 4*ai[i];
39018f690400SShri Abhyankar     vi    = aj + ai[i];
39028f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
390329b92fc1SShri Abhyankar     idx   = 2*r[i];
39048f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
390529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
390629b92fc1SShri Abhyankar       jdx   = 2*vi[m];
39078f690400SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
39088f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
39098f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
39108f690400SShri Abhyankar       v += 4;
39118f690400SShri Abhyankar     }
39128f690400SShri Abhyankar     idx = 2*i;
39138f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
39148f690400SShri Abhyankar   }
39158f690400SShri Abhyankar   /* backward solve the upper triangular */
39168f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
39178f690400SShri Abhyankar     k = 2*n-i;
39188f690400SShri Abhyankar     v    = aa + 4*ai[k];
39198f690400SShri Abhyankar     vi   = aj + ai[k];
39208f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
39218f690400SShri Abhyankar     idt  = 2*i;
39228f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
392329b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
392429b92fc1SShri Abhyankar       idx   = 2*vi[m];
39258f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
39268f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
39278f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
39288f690400SShri Abhyankar       v += 4;
39298f690400SShri Abhyankar     }
393029b92fc1SShri Abhyankar     idc = 2*c[i];
39318f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
39328f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
39338f690400SShri Abhyankar   }
39348f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
39358f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
39368f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39378f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
39388f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
39398f690400SShri Abhyankar   PetscFunctionReturn(0);
39408f690400SShri Abhyankar }
39418f690400SShri Abhyankar 
39428f690400SShri Abhyankar 
394315091d37SBarry Smith /*
394415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
394515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
394615091d37SBarry Smith */
39474a2ae208SSatish Balay #undef __FUNCT__
39484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3949dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
395015091d37SBarry Smith {
395115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3952690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3953dfbe8321SBarry Smith   PetscErrorCode    ierr;
3954690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3955d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3956d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
3957d9fead3dSBarry Smith   const PetscScalar *b;
3958690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
395915091d37SBarry Smith 
396015091d37SBarry Smith   PetscFunctionBegin;
3961d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
396315091d37SBarry Smith 
396415091d37SBarry Smith   /* forward solve the lower triangular */
396515091d37SBarry Smith   idx    = 0;
396615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
396715091d37SBarry Smith   for (i=1; i<n; i++) {
396815091d37SBarry Smith     v     =  aa      + 4*ai[i];
396915091d37SBarry Smith     vi    =  aj      + ai[i];
397015091d37SBarry Smith     nz    =  diag[i] - ai[i];
397115091d37SBarry Smith     idx   +=  2;
3972f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
397315091d37SBarry Smith     while (nz--) {
397415091d37SBarry Smith       jdx   = 2*(*vi++);
397515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3976f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3977f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
397815091d37SBarry Smith       v    += 4;
397915091d37SBarry Smith     }
3980f1af5d2fSBarry Smith     x[idx]   = s1;
3981f1af5d2fSBarry Smith     x[1+idx] = s2;
398215091d37SBarry Smith   }
398315091d37SBarry Smith   /* backward solve the upper triangular */
398415091d37SBarry Smith   for (i=n-1; i>=0; i--){
398515091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
398615091d37SBarry Smith     vi   = aj + diag[i] + 1;
398715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
398815091d37SBarry Smith     idt  = 2*i;
3989f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
399015091d37SBarry Smith     while (nz--) {
399115091d37SBarry Smith       idx   = 2*(*vi++);
399215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3993f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3994f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
399515091d37SBarry Smith       v    += 4;
399615091d37SBarry Smith     }
399715091d37SBarry Smith     v        = aa +  4*diag[i];
3998f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3999f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
400015091d37SBarry Smith   }
400115091d37SBarry Smith 
4002d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4004dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
400515091d37SBarry Smith   PetscFunctionReturn(0);
400615091d37SBarry Smith }
400715091d37SBarry Smith 
40084a2ae208SSatish Balay #undef __FUNCT__
4009cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4010cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4011cee9d6f2SShri Abhyankar {
4012cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4013ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4014cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4015cee9d6f2SShri Abhyankar     PetscInt          jdx;
4016cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4017cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4018cee9d6f2SShri Abhyankar     const PetscScalar *b;
4019cee9d6f2SShri Abhyankar 
4020cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4021cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4022cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4023cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4024cee9d6f2SShri Abhyankar     idx    = 0;
4025cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4026cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4027cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
4028cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4029cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4030cee9d6f2SShri Abhyankar        idx  = 2*i;
4031cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4032ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4033ce3d78c0SShri Abhyankar          jdx   = 2*vi[k];
4034cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4035cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4036cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4037cee9d6f2SShri Abhyankar            v   +=  4;
4038cee9d6f2SShri Abhyankar         }
4039cee9d6f2SShri Abhyankar        x[idx]   = s1;
4040cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4041cee9d6f2SShri Abhyankar     }
4042cee9d6f2SShri Abhyankar 
4043cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4044cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4045cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
4046cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4047cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4048cee9d6f2SShri Abhyankar      idt = 2*i;
4049cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4050ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4051ce3d78c0SShri Abhyankar       idx   = 2*vi[k];
4052cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4053cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4054cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4055cee9d6f2SShri Abhyankar          v    += 4;
4056cee9d6f2SShri Abhyankar     }
4057cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4058cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4059cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4060cee9d6f2SShri Abhyankar   }
4061cee9d6f2SShri Abhyankar 
4062cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4063cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4064cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4065cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4066cee9d6f2SShri Abhyankar }
4067cee9d6f2SShri Abhyankar 
4068cee9d6f2SShri Abhyankar #undef __FUNCT__
4069*b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4070*b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4071*b2b2dd24SShri Abhyankar {
4072*b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4073*b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4074*b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4075*b2b2dd24SShri Abhyankar     PetscInt          jdx;
4076*b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4077*b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4078*b2b2dd24SShri Abhyankar     const PetscScalar *b;
4079*b2b2dd24SShri Abhyankar 
4080*b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4081*b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4082*b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4083*b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4084*b2b2dd24SShri Abhyankar     idx    = 0;
4085*b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4086*b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4087*b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4088*b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4089*b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4090*b2b2dd24SShri Abhyankar        idx  = 2*i;
4091*b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4092*b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4093*b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4094*b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4095*b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4096*b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4097*b2b2dd24SShri Abhyankar            v   +=  4;
4098*b2b2dd24SShri Abhyankar         }
4099*b2b2dd24SShri Abhyankar        x[idx]   = s1;
4100*b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4101*b2b2dd24SShri Abhyankar     }
4102*b2b2dd24SShri Abhyankar 
4103*b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4104*b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4105*b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4106*b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4107*b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4108*b2b2dd24SShri Abhyankar      idt = 2*i;
4109*b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4110*b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4111*b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4112*b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4113*b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4114*b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4115*b2b2dd24SShri Abhyankar          v    += 4;
4116*b2b2dd24SShri Abhyankar     }
4117*b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4118*b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4119*b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4120*b2b2dd24SShri Abhyankar   }
4121*b2b2dd24SShri Abhyankar 
4122*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4123*b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4124*b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4125*b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4126*b2b2dd24SShri Abhyankar }
4127*b2b2dd24SShri Abhyankar 
4128*b2b2dd24SShri Abhyankar #undef __FUNCT__
41294a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4130dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
41314e2b4712SSatish Balay {
41324e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
41334e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
41346849ba73SBarry Smith   PetscErrorCode ierr;
41355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
41365d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
41373f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
413887828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
41394e2b4712SSatish Balay 
41404e2b4712SSatish Balay   PetscFunctionBegin;
41414e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
41424e2b4712SSatish Balay 
41431ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
41441ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4145f1af5d2fSBarry Smith   t  = a->solve_work;
41464e2b4712SSatish Balay 
41474e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
41484e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
41494e2b4712SSatish Balay 
41504e2b4712SSatish Balay   /* forward solve the lower triangular */
4151f1af5d2fSBarry Smith   t[0] = b[*r++];
41524e2b4712SSatish Balay   for (i=1; i<n; i++) {
41534e2b4712SSatish Balay     v     = aa + ai[i];
41544e2b4712SSatish Balay     vi    = aj + ai[i];
41554e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4156f1af5d2fSBarry Smith     s1  = b[*r++];
41574e2b4712SSatish Balay     while (nz--) {
4158f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
41594e2b4712SSatish Balay     }
4160f1af5d2fSBarry Smith     t[i] = s1;
41614e2b4712SSatish Balay   }
41624e2b4712SSatish Balay   /* backward solve the upper triangular */
41634e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
41644e2b4712SSatish Balay     v    = aa + diag[i] + 1;
41654e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41664e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4167f1af5d2fSBarry Smith     s1 = t[i];
41684e2b4712SSatish Balay     while (nz--) {
4169f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
41704e2b4712SSatish Balay     }
4171f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
41724e2b4712SSatish Balay   }
41734e2b4712SSatish Balay 
41744e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
41754e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
41761ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
41771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4178dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
41794e2b4712SSatish Balay   PetscFunctionReturn(0);
41804e2b4712SSatish Balay }
418115091d37SBarry Smith /*
418215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
418315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
418415091d37SBarry Smith */
41854a2ae208SSatish Balay #undef __FUNCT__
41864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4187dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
418815091d37SBarry Smith {
418915091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4190690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4191dfbe8321SBarry Smith   PetscErrorCode ierr;
4192690b6cddSBarry Smith   PetscInt       *diag = a->diag;
419315091d37SBarry Smith   MatScalar      *aa=a->a;
419487828ca2SBarry Smith   PetscScalar    *x,*b;
419587828ca2SBarry Smith   PetscScalar    s1,x1;
419615091d37SBarry Smith   MatScalar      *v;
4197690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
419815091d37SBarry Smith 
419915091d37SBarry Smith   PetscFunctionBegin;
42001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
42011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
420215091d37SBarry Smith 
420315091d37SBarry Smith   /* forward solve the lower triangular */
420415091d37SBarry Smith   idx    = 0;
420515091d37SBarry Smith   x[0]   = b[0];
420615091d37SBarry Smith   for (i=1; i<n; i++) {
420715091d37SBarry Smith     v     =  aa      + ai[i];
420815091d37SBarry Smith     vi    =  aj      + ai[i];
420915091d37SBarry Smith     nz    =  diag[i] - ai[i];
421015091d37SBarry Smith     idx   +=  1;
4211f1af5d2fSBarry Smith     s1  =  b[idx];
421215091d37SBarry Smith     while (nz--) {
421315091d37SBarry Smith       jdx   = *vi++;
421415091d37SBarry Smith       x1    = x[jdx];
4215f1af5d2fSBarry Smith       s1 -= v[0]*x1;
421615091d37SBarry Smith       v    += 1;
421715091d37SBarry Smith     }
4218f1af5d2fSBarry Smith     x[idx]   = s1;
421915091d37SBarry Smith   }
422015091d37SBarry Smith   /* backward solve the upper triangular */
422115091d37SBarry Smith   for (i=n-1; i>=0; i--){
422215091d37SBarry Smith     v    = aa + diag[i] + 1;
422315091d37SBarry Smith     vi   = aj + diag[i] + 1;
422415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
422515091d37SBarry Smith     idt  = i;
4226f1af5d2fSBarry Smith     s1 = x[idt];
422715091d37SBarry Smith     while (nz--) {
422815091d37SBarry Smith       idx   = *vi++;
422915091d37SBarry Smith       x1    = x[idx];
4230f1af5d2fSBarry Smith       s1 -= v[0]*x1;
423115091d37SBarry Smith       v    += 1;
423215091d37SBarry Smith     }
423315091d37SBarry Smith     v        = aa +  diag[i];
4234f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
423515091d37SBarry Smith   }
42361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
42371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4238dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
423915091d37SBarry Smith   PetscFunctionReturn(0);
424015091d37SBarry Smith }
42414e2b4712SSatish Balay 
42424e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/* Prototypes for routines that are called by the factorization code further down in this file */
424316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
42446bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
42456bce7ff8SHong Zhang 
/* MatSolve variants installed as fact->ops->solve for block sizes without a specialized routine */
424684a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
42478f690400SShri Abhyankar extern PetscErrorCode MatSolve_SeqBAIJ_N_newdatastruct(Mat,Vec,Vec);
42488f690400SShri Abhyankar 
/*
   MatLUFactorNumeric_SeqBAIJ_N_newdatastruct - numeric block LU factorization for an
   arbitrary block size bs, writing into a factor whose row pointer array b->i is indexed
   two ways:
      L part of block row i : entries bi[i]     .. bi[i+1]-1
      U part of block row i : entries bi[2*n-i] .. bi[2*n-i+1]-1
   The last slot of each U row is excluded from the off-diagonal copy; the diagonal block is
   written separately at b->diag[i] and stored INVERTED.

   Parameters:
     B    - factor matrix; b->row, b->icol, b->i, b->j and b->diag are read here, so they must
            already have been filled in (see the symbolic routines later in this file)
     A    - matrix whose values are factored
     info - not referenced in this routine

   Returns 0 on success; any PETSc error is propagated through CHKERRQ().
   On success C->assembled is set to PETSC_TRUE.
*/
42496bce7ff8SHong Zhang #undef __FUNCT__
42506bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
42516bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
42526bce7ff8SHong Zhang {
42536bce7ff8SHong Zhang   Mat            C=B;
42546bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
42556bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
42566bce7ff8SHong Zhang   PetscErrorCode ierr;
42576bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
42586bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
42596bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4260b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4261914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4262914a18a2SHong Zhang   MatScalar      *v_work;
42636bce7ff8SHong Zhang 
42646bce7ff8SHong Zhang   PetscFunctionBegin;
42656bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
42666bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
  /* rtmp is a dense work row: one bs2-sized block slot per block column (n slots, plus one spare scalar) */
4267914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4268914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
  /* NOTE(review): ics is assigned here but never read again in this routine */
42696bce7ff8SHong Zhang   ics  = ic;
42706bce7ff8SHong Zhang 
4271914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
  /* one allocation carved into three pieces:
       v_work   : first bs MatScalars
       mwork    : next bs2 MatScalars
       v_pivots : bs PetscInts placed directly after mwork
     NOTE(review): v_pivots starts at a MatScalar-sized offset; if sizeof(MatScalar) < sizeof(PetscInt)
     its alignment for PetscInt is not obviously guaranteed - confirm */
4272914a18a2SHong Zhang   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4273b588c5a2SHong Zhang   mwork    = v_work + bs;
4274b588c5a2SHong Zhang   v_pivots = (PetscInt*)(mwork + bs2);
4275914a18a2SHong Zhang 
42766bce7ff8SHong Zhang   for (i=0; i<n; i++){
42776bce7ff8SHong Zhang     /* zero rtmp */
    /* only the block slots that appear in the factor's pattern for row i are cleared */
42786bce7ff8SHong Zhang     /* L part */
42796bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
42806bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4281914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4282914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4283914a18a2SHong Zhang     }
42846bce7ff8SHong Zhang 
42856bce7ff8SHong Zhang     /* U part */
    /* here nz includes the last slot of the U row, so that block slot is cleared as well */
42866bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
42876bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
4288914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4289914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4290914a18a2SHong Zhang     }
42916bce7ff8SHong Zhang 
42926bce7ff8SHong Zhang     /* load in initial (unfactored row) */
    /* block row r[i] of A is scattered into rtmp; each block column index is mapped through ic[] */
42936bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
42946bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
4295914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
42966bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4297914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
42986bce7ff8SHong Zhang     }
42996bce7ff8SHong Zhang 
43006bce7ff8SHong Zhang     /* elimination */
    /* visit the L-pattern columns of row i in stored order; each one names a previously finished pivot row */
43016bce7ff8SHong Zhang     bjtmp = bj + bi[i];
43026bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
4303b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
4304b1646270SShri Abhyankar       row = bjtmp[k];
4305914a18a2SHong Zhang       pc = rtmp + bs2*row;
      /* flg becomes 1 as soon as any scalar of the block is nonzero; all-zero blocks are skipped */
4306914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4307914a18a2SHong Zhang       if (flg) {
        /* pv is the diagonal block of the pivot row (stored inverted by an earlier iteration of the outer loop) */
4308914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
4309b588c5a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
43106bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* beginning of U(row,:) */
4311914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
43126bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries in U(row,:), excluding diag */
4313914a18a2SHong Zhang         for (j=0; j<nz; j++) {
4314914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4315914a18a2SHong Zhang         }
4316b588c5a2SHong Zhang         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
43176bce7ff8SHong Zhang       }
43186bce7ff8SHong Zhang     }
43196bce7ff8SHong Zhang 
43206bce7ff8SHong Zhang     /* finished row so stick it into b->a */
43216bce7ff8SHong Zhang     /* L part */
4322914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
43236bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
43246bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
43256bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4326914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
43276bce7ff8SHong Zhang     }
43286bce7ff8SHong Zhang 
43296bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simpler triangular solves */
4330914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
43316bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
4332914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
    /* the block is copied from the rtmp slot named by the column index stored at bdiag[i], then inverted in place */
4333914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4334914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
43356bce7ff8SHong Zhang 
43366bce7ff8SHong Zhang     /* U part */
    /* the "- 1" leaves out the final slot of the U row, so only the remaining entries are copied here */
4337914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
43386bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
43396bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4340914a18a2SHong Zhang     for (j=0; j<nz; j++){
4341914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4342914a18a2SHong Zhang     }
43436bce7ff8SHong Zhang   }
43446bce7ff8SHong Zhang 
  /* release work arrays and index sets; note v_work also backs mwork and v_pivots */
43456bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
43466bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
43476bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
43486bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
434927019359SHong Zhang 
43506bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
4351914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
43526bce7ff8SHong Zhang   PetscFunctionReturn(0);
43536bce7ff8SHong Zhang }
43546bce7ff8SHong Zhang 
43556bce7ff8SHong Zhang /*
43566bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
435716a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
435816a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
43596bce7ff8SHong Zhang */
/*
   MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct - builds the index arrays of an ILU(0)
   factor by copying the block nonzero pattern of A, split at A's diagonal, into b->i, b->j
   and b->diag of fact. The value array b->a is allocated and zeroed; no values are computed.

   Parameters:
     fact  - factor matrix to fill in (first set up via MatDuplicateNoCreate_SeqBAIJ())
     A     - matrix supplying the pattern; a->i, a->j and a->diag are read
     isrow, iscol, info - not referenced in this routine

   Resulting layout (n = a->mbs):
     bi[0..n]      row pointers for L: row i holds A's entries that precede a->diag[i]
     bi[n+1..2n+1] row pointers for U, filled for i = n-1 down to 0, so row i occupies
                   bi[2*n-i] .. bi[2*n-i+1]-1: A's entries after the diagonal, then column i itself
     bdiag[i]      index of that trailing diagonal entry

   NOTE(review): a->diag is used as-is; presumably the caller guarantees it is valid - confirm.
*/
43606bce7ff8SHong Zhang #undef __FUNCT__
43616bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
43626bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
43636bce7ff8SHong Zhang {
43646bce7ff8SHong Zhang 
43656bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
43666bce7ff8SHong Zhang   PetscErrorCode     ierr;
436716a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
436816a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
43696bce7ff8SHong Zhang 
43706bce7ff8SHong Zhang   PetscFunctionBegin;
437116a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
437216a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
43736bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
437416a2bf60SHong Zhang 
437516a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
  /* values: bs2*ai[n]+1 scalars; column indices: ai[n]+1 ints; row pointers: 2*n+2 ints */
437616a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
437716a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
437816a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
  /* reuse an existing diag array if fact already has one */
437916a2bf60SHong Zhang   if (!b->diag){
438016a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
438116a2bf60SHong Zhang   }
4382914a18a2SHong Zhang   bdiag = b->diag;
43836bce7ff8SHong Zhang 
  /* NOTE(review): the array was allocated in units of PetscScalar but is zeroed in units of MatScalar */
438416a2bf60SHong Zhang   if (n > 0) {
438516a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
43866bce7ff8SHong Zhang   }
43876bce7ff8SHong Zhang 
43886bce7ff8SHong Zhang   /* set bi and bj with new data structure */
  /* bj is advanced as a write cursor through b->j by both loops below */
43896bce7ff8SHong Zhang   bi = b->i;
43906bce7ff8SHong Zhang   bj = b->j;
43916bce7ff8SHong Zhang 
43926bce7ff8SHong Zhang   /* L part */
43936bce7ff8SHong Zhang   bi[0] = 0;
439416a2bf60SHong Zhang   for (i=0; i<n; i++){
    /* number of entries of row i of A stored before its diagonal */
43956bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
4396914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
43976bce7ff8SHong Zhang     aj = a->j + ai[i];
43986bce7ff8SHong Zhang     for (j=0; j<nz; j++){
43996bce7ff8SHong Zhang       *bj = aj[j]; bj++;
44006bce7ff8SHong Zhang     }
44016bce7ff8SHong Zhang   }
44026bce7ff8SHong Zhang 
44036bce7ff8SHong Zhang   /* U part */
  /* the U rows are appended right after L, last row first */
440416a2bf60SHong Zhang   bi[n+1] = bi[n];
440516a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
    /* number of entries of row i of A stored after its diagonal; the row gets one extra slot for the diagonal */
44066bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
440716a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
44086bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
44096bce7ff8SHong Zhang     for (j=0; j<nz; j++){
44106bce7ff8SHong Zhang       *bj = aj[j]; bj++;
44116bce7ff8SHong Zhang     }
44126bce7ff8SHong Zhang     /* diag[i] */
44136bce7ff8SHong Zhang     *bj = i; bj++;
441416a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
44156bce7ff8SHong Zhang   }
44166bce7ff8SHong Zhang   PetscFunctionReturn(0);
44176bce7ff8SHong Zhang }
44186bce7ff8SHong Zhang 
/*
   MatILUFactorSymbolic_SeqBAIJ_newdatastruct - symbolic ILU(levels) factorization that
   produces the factor in the layout with a (2*n+2)-entry row pointer array.

   Parameters:
     fact  - factor matrix whose Mat_SeqBAIJ data and ops table are filled in
     A     - matrix to analyse; must be square and must have every diagonal block present
     isrow - row permutation
     iscol - column permutation (its inverse, isicol, is created here and stored in b->icol)
     info  - supplies fill, levels, diagonal_fill (and pivotinblocks on the ILU(0) path)

   Two paths:
     1. levels == 0 with identity row and column orderings: delegate the pattern construction to
        MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct() and return early.
     2. otherwise: row-by-row level-based symbolic factorization using a linked list for the
        active row and PetscFreeSpace lists for the growing column-index / level arrays.
   Both paths install MatLUFactorNumeric_SeqBAIJ_N_newdatastruct as the numeric routine and pick
   fact->ops->solve from the block size (2..7 have dedicated routines, anything else uses _N_).

   Errors: PETSC_ERR_ARG_WRONG (not square), PETSC_ERR_ARG_WRONGSTATE (missing diagonal in A),
   PETSC_ERR_MAT_LU_ZRPVT (empty row, or diagonal missing in the factored row).
*/
441916a2bf60SHong Zhang #undef __FUNCT__
442016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
442116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
442216a2bf60SHong Zhang {
442316a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
442416a2bf60SHong Zhang   IS                 isicol;
442516a2bf60SHong Zhang   PetscErrorCode     ierr;
442616a2bf60SHong Zhang   const PetscInt     *r,*ic;
44277fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
442816a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
442916a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
443016a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
44317fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
443216a2bf60SHong Zhang   PetscReal          f;
443316a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
443416a2bf60SHong Zhang   PetscBT            lnkbt;
443516a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
443616a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
443716a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
443816a2bf60SHong Zhang   PetscTruth         missing;
44397fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
444016a2bf60SHong Zhang 
444116a2bf60SHong Zhang   PetscFunctionBegin;
  /* preconditions: square matrix with no missing diagonal block */
444216a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
444316a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
444416a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
444516a2bf60SHong Zhang 
444616a2bf60SHong Zhang   f             = info->fill;
444716a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
444816a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
  /* isicol is created here and later handed to the factor as b->icol on both paths */
444916a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
445016a2bf60SHong Zhang 
445116a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
445216a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
44537fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
445416a2bf60SHong Zhang 
44557fa3a6a0SHong Zhang   if (!levels && both_identity) {
445616a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
445716a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
445816a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
44597fa3a6a0SHong Zhang     /* set MatSolve routines */
    /* orderings are identity on this path, so only the NaturalOrdering solves are candidates */
44607fa3a6a0SHong Zhang     switch (bs){
44617fa3a6a0SHong Zhang     case 2:
44627fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
44637fa3a6a0SHong Zhang       break;
44647fa3a6a0SHong Zhang     case 3:
44657fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
44667fa3a6a0SHong Zhang       break;
44677fa3a6a0SHong Zhang     case 4:
44687fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
44697fa3a6a0SHong Zhang       break;
44707fa3a6a0SHong Zhang     case 5:
44717fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
44727fa3a6a0SHong Zhang       break;
44737fa3a6a0SHong Zhang     case 6:
44747fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
44757fa3a6a0SHong Zhang       break;
44767fa3a6a0SHong Zhang     case 7:
44777fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
44787fa3a6a0SHong Zhang       break;
44797fa3a6a0SHong Zhang     default:
44807fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
44817fa3a6a0SHong Zhang       break;
44827fa3a6a0SHong Zhang     }
448316a2bf60SHong Zhang 
448416a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
    /* ILU(0) reuses A's pattern: no reallocations, needed fill ratio is exactly 1 */
448516a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
448616a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
448716a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
448816a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
448916a2bf60SHong Zhang     b->row           = isrow;
449016a2bf60SHong Zhang     b->col           = iscol;
449116a2bf60SHong Zhang     b->icol          = isicol;
    /* the factor keeps isrow/iscol, so take a reference on each */
449216a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
449316a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
449416a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4495b588c5a2SHong Zhang     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
449616a2bf60SHong Zhang     PetscFunctionReturn(0);
449716a2bf60SHong Zhang   }
449816a2bf60SHong Zhang 
  /* ---- general case: levels > 0 and/or non-identity ordering ---- */
449916a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
450016a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
450116a2bf60SHong Zhang 
450216a2bf60SHong Zhang   /* get new row pointers */
  /* sized 2*n+2 for the final layout; inside the row loop below only bi[0..n] is written */
450316a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
450416a2bf60SHong Zhang   bi[0] = 0;
450516a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
  /* during the row loop bdiag[i] holds the number of entries preceding the diagonal within row i */
450616a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
450716a2bf60SHong Zhang   bdiag[0]  = 0;
450816a2bf60SHong Zhang 
  /* single allocation shared by two per-row pointer tables: bj_ptr[0..n) and bjlvl_ptr[0..n) */
450916a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
451016a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
451116a2bf60SHong Zhang 
451216a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
451316a2bf60SHong Zhang   nlnk = n + 1;
451416a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
451516a2bf60SHong Zhang 
451616a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
  /* two parallel free-space lists: one for column indices, one for fill levels */
451716a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
451816a2bf60SHong Zhang   current_space = free_space;
451916a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
452016a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
452116a2bf60SHong Zhang 
452216a2bf60SHong Zhang   for (i=0; i<n; i++) {
    /* nzi accumulates the number of entries in factored row i */
452316a2bf60SHong Zhang     nzi = 0;
452416a2bf60SHong Zhang     /* copy current row into linked list */
452516a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
452616a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
452716a2bf60SHong Zhang     cols = aj + ai[r[i]];
452816a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
452916a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
453016a2bf60SHong Zhang     nzi += nlnk;
453116a2bf60SHong Zhang 
453216a2bf60SHong Zhang     /* make sure diagonal entry is included */
    /* lnk[i] still being -1 means the row did not contain column i; splice it in at level 0 */
453316a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
453416a2bf60SHong Zhang       fm = n;
453516a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
453616a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
453716a2bf60SHong Zhang       lnk[fm]    = i;
453816a2bf60SHong Zhang       lnk_lvl[i] = 0;
453916a2bf60SHong Zhang       nzi++; dcount++;
454016a2bf60SHong Zhang     }
454116a2bf60SHong Zhang 
454216a2bf60SHong Zhang     /* add pivot rows into the active row */
    /* walk the list entries with column < i; for each such pivot row merge in the part of that row
       stored after its diagonal (offset bdiag[prow]+1), together with the matching fill levels */
454316a2bf60SHong Zhang     nzbd = 0;
454416a2bf60SHong Zhang     prow = lnk[n];
454516a2bf60SHong Zhang     while (prow < i) {
454616a2bf60SHong Zhang       nnz      = bdiag[prow];
454716a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
454816a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
454916a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
455016a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
455116a2bf60SHong Zhang       nzi += nlnk;
455216a2bf60SHong Zhang       prow = lnk[prow];
455316a2bf60SHong Zhang       nzbd++;
455416a2bf60SHong Zhang     }
    /* nzbd = how many list entries had column < i, i.e. the diagonal's offset within row i */
455516a2bf60SHong Zhang     bdiag[i] = nzbd;
455616a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
455716a2bf60SHong Zhang 
455816a2bf60SHong Zhang     /* if free space is not available, make more free space */
455916a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
456016a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
456116a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
456216a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
456316a2bf60SHong Zhang       reallocs++;
456416a2bf60SHong Zhang     }
456516a2bf60SHong Zhang 
456616a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
456716a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
    /* remember where row i's indices and levels start so later rows can merge them */
456816a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
456916a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
457016a2bf60SHong Zhang 
457116a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
457216a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
457316a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
457416a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
457516a2bf60SHong Zhang     }
457616a2bf60SHong Zhang 
    /* consume nzi slots from both free-space chunks */
457716a2bf60SHong Zhang     current_space->array           += nzi;
457816a2bf60SHong Zhang     current_space->local_used      += nzi;
457916a2bf60SHong Zhang     current_space->local_remaining -= nzi;
458016a2bf60SHong Zhang     current_space_lvl->array           += nzi;
458116a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
458216a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
458316a2bf60SHong Zhang   }
458416a2bf60SHong Zhang 
458516a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
458616a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
458716a2bf60SHong Zhang 
458816a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
  /* at this point bi[n] is the total number of entries accumulated by the row loop */
458916a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
459016a2bf60SHong Zhang 
459116a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
4592783ef271SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
459316a2bf60SHong Zhang 
459416a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
459516a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
459616a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
459716a2bf60SHong Zhang 
459816a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
459916a2bf60SHong Zhang   {
    /* NOTE(review): this ratio uses bi[n], but fill_ratio_needed further down uses bi[2*n+1]; if
       PetscFreeSpaceContiguous_LU() rewrote bi into the L/U layout, bi[n] may no longer be the total
       entry count - confirm which index is intended */
460016a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
460116a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
460216a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
460316a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
460416a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
    /* NOTE(review): unlike the messages above, the format string below has no trailing newline */
460516a2bf60SHong Zhang     if (diagonal_fill) {
460616a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
460716a2bf60SHong Zhang     }
460816a2bf60SHong Zhang   }
460916a2bf60SHong Zhang #endif
461016a2bf60SHong Zhang 
461116a2bf60SHong Zhang   /* put together the new matrix */
461216a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
461316a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
461416a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
  /* a, j and i are three separate allocations owned by the factor */
461516a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
461616a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
461716a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
  /* value storage: one bs2-sized block per entry, bi[2*n+1] entries in total */
46187fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
461916a2bf60SHong Zhang   b->j          = bj;
462016a2bf60SHong Zhang   b->i          = bi;
462116a2bf60SHong Zhang   b->diag       = bdiag;
46227f53bb6cSHong Zhang   b->free_diag  = PETSC_TRUE;
462316a2bf60SHong Zhang   b->ilen       = 0;
462416a2bf60SHong Zhang   b->imax       = 0;
462516a2bf60SHong Zhang   b->row        = isrow;
462616a2bf60SHong Zhang   b->col        = iscol;
462716a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
462816a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
462916a2bf60SHong Zhang   b->icol       = isicol;
  /* NOTE(review): the ILU(0) path above also sets b->pivotinblocks and fact->factor; neither is
     assigned on this path - presumably set elsewhere, confirm */
46307fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
463116a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
463216a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
46337fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
463416a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
463516a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
463616a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
463716a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
463816a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
46397fa3a6a0SHong Zhang   /* set MatSolve routines */
  /* identity orderings select the NaturalOrdering solves; otherwise the permuting solves are used */
46407fa3a6a0SHong Zhang   if (both_identity){
46417fa3a6a0SHong Zhang     switch (bs){
46427fa3a6a0SHong Zhang     case 2:
46437fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
46447fa3a6a0SHong Zhang       break;
46457fa3a6a0SHong Zhang     case 3:
46467fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
46477fa3a6a0SHong Zhang       break;
46487fa3a6a0SHong Zhang     case 4:
46497fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
46507fa3a6a0SHong Zhang       break;
46517fa3a6a0SHong Zhang     case 5:
46527fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
46537fa3a6a0SHong Zhang       break;
46547fa3a6a0SHong Zhang     case 6:
46557fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
46567fa3a6a0SHong Zhang       break;
46577fa3a6a0SHong Zhang     case 7:
46587fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
46597fa3a6a0SHong Zhang       break;
46607fa3a6a0SHong Zhang     default:
46617fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
46627fa3a6a0SHong Zhang       break;
46637fa3a6a0SHong Zhang     }
46647fa3a6a0SHong Zhang   } else {
46657fa3a6a0SHong Zhang     switch (bs){
46667fa3a6a0SHong Zhang     case 2:
46677fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
46687fa3a6a0SHong Zhang       break;
46697fa3a6a0SHong Zhang     case 3:
46707fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
46717fa3a6a0SHong Zhang       break;
46727fa3a6a0SHong Zhang     case 4:
46737fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
46747fa3a6a0SHong Zhang       break;
46757fa3a6a0SHong Zhang     case 5:
46767fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
46777fa3a6a0SHong Zhang       break;
46787fa3a6a0SHong Zhang     case 6:
46797fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
46807fa3a6a0SHong Zhang       break;
46817fa3a6a0SHong Zhang     case 7:
46827fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
46837fa3a6a0SHong Zhang       break;
46847fa3a6a0SHong Zhang     default:
46857fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
46867fa3a6a0SHong Zhang       break;
46877fa3a6a0SHong Zhang     }
46887fa3a6a0SHong Zhang   }
468916a2bf60SHong Zhang   PetscFunctionReturn(0);
469016a2bf60SHong Zhang }
469116a2bf60SHong Zhang 
46924e2b4712SSatish Balay /*
46934e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
46944e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
46954e2b4712SSatish Balay    Not a good example of code reuse.
46964e2b4712SSatish Balay */
46974a2ae208SSatish Balay #undef __FUNCT__
46984a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
46990481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
47004e2b4712SSatish Balay {
47014e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
47024e2b4712SSatish Balay   IS             isicol;
47036849ba73SBarry Smith   PetscErrorCode ierr;
47045d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
47055d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4706a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4707d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
470841df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
4709329f5518SBarry Smith   PetscReal      f;
471016a2bf60SHong Zhang   PetscTruth     newdatastruct=PETSC_FALSE;
47114e2b4712SSatish Balay 
47124e2b4712SSatish Balay   PetscFunctionBegin;
471316a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
471416a2bf60SHong Zhang   if (newdatastruct){
471516a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
471616a2bf60SHong Zhang     PetscFunctionReturn(0);
471716a2bf60SHong Zhang   }
471816a2bf60SHong Zhang 
47196bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
47206bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
47216bce7ff8SHong Zhang 
4722435faa5fSBarry Smith   f             = info->fill;
4723690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
4724690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
47254c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
472616a2bf60SHong Zhang 
4727667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4728667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
47297d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
4730309c388cSBarry Smith 
473141df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
473216a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
47336bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
47346bce7ff8SHong Zhang 
4735719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
4736719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
4737bb3d539aSBarry Smith     b->row       = isrow;
4738bb3d539aSBarry Smith     b->col       = iscol;
4739bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4740bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4741bb3d539aSBarry Smith     b->icol      = isicol;
4742bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4743b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
47446bce7ff8SHong Zhang     PetscFunctionReturn(0);
47456bce7ff8SHong Zhang   }
47466bce7ff8SHong Zhang 
47476bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
47484e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
47494e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
47504e2b4712SSatish Balay 
47514e2b4712SSatish Balay     /* get new row pointers */
4752690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
47534e2b4712SSatish Balay     ainew[0] = 0;
47544e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
4755690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
4756690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
47574e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
4758690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
47594e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
4760690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
47614e2b4712SSatish Balay     /* im is level for each filled value */
4762690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
47634e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
4764690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
47654e2b4712SSatish Balay     dloc[0]  = 0;
47664e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
4767435faa5fSBarry Smith 
4768435faa5fSBarry Smith       /* copy prow into linked list */
47694e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
47703b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
47714e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
47724e2b4712SSatish Balay       fill[n]    = n;
4773435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
47744e2b4712SSatish Balay       while (nz--) {
47754e2b4712SSatish Balay 	fm  = n;
47764e2b4712SSatish Balay 	idx = ic[*xi++];
47774e2b4712SSatish Balay 	do {
47784e2b4712SSatish Balay 	  m  = fm;
47794e2b4712SSatish Balay 	  fm = fill[m];
47804e2b4712SSatish Balay 	} while (fm < idx);
47814e2b4712SSatish Balay 	fill[m]   = idx;
47824e2b4712SSatish Balay 	fill[idx] = fm;
47834e2b4712SSatish Balay 	im[idx]   = 0;
47844e2b4712SSatish Balay       }
4785435faa5fSBarry Smith 
4786435faa5fSBarry Smith       /* make sure diagonal entry is included */
4787435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
4788435faa5fSBarry Smith 	fm = n;
4789435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
4790435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4791435faa5fSBarry Smith 	fill[fm]   = prow;
4792435faa5fSBarry Smith 	im[prow]   = 0;
4793435faa5fSBarry Smith 	nzf++;
4794335d9088SBarry Smith 	dcount++;
4795435faa5fSBarry Smith       }
4796435faa5fSBarry Smith 
47974e2b4712SSatish Balay       nzi = 0;
47984e2b4712SSatish Balay       row = fill[n];
47994e2b4712SSatish Balay       while (row < prow) {
48004e2b4712SSatish Balay 	incrlev = im[row] + 1;
48014e2b4712SSatish Balay 	nz      = dloc[row];
4802435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
48034e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
48044e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
48054e2b4712SSatish Balay 	fm      = row;
48064e2b4712SSatish Balay 	while (nnz-- > 0) {
48074e2b4712SSatish Balay 	  idx = *xi++;
48084e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
48094e2b4712SSatish Balay 	    flev++;
48104e2b4712SSatish Balay 	    continue;
48114e2b4712SSatish Balay 	  }
48124e2b4712SSatish Balay 	  do {
48134e2b4712SSatish Balay 	    m  = fm;
48144e2b4712SSatish Balay 	    fm = fill[m];
48154e2b4712SSatish Balay 	  } while (fm < idx);
48164e2b4712SSatish Balay 	  if (fm != idx) {
48174e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
48184e2b4712SSatish Balay 	    fill[m]   = idx;
48194e2b4712SSatish Balay 	    fill[idx] = fm;
48204e2b4712SSatish Balay 	    fm        = idx;
48214e2b4712SSatish Balay 	    nzf++;
4822ecf371e4SBarry Smith 	  } else {
48234e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
48244e2b4712SSatish Balay 	  }
48254e2b4712SSatish Balay 	  flev++;
48264e2b4712SSatish Balay 	}
48274e2b4712SSatish Balay 	row = fill[row];
48284e2b4712SSatish Balay 	nzi++;
48294e2b4712SSatish Balay       }
48304e2b4712SSatish Balay       /* copy new filled row into permanent storage */
48314e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
48324e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
4833ecf371e4SBarry Smith 
4834ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
4835ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4836ecf371e4SBarry Smith 	/* just double the memory each time */
4837690b6cddSBarry Smith 	PetscInt maxadd = jmax;
4838ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
48394e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
48404e2b4712SSatish Balay 	jmax += maxadd;
4841ecf371e4SBarry Smith 
4842ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
48435d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
48445d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4845606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
48465d0c19d7SBarry Smith 	ajnew = xitmp;
48475d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
48485d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4849606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
48505d0c19d7SBarry Smith 	ajfill = xitmp;
4851eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
48524e2b4712SSatish Balay       }
48535d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
48544e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
48554e2b4712SSatish Balay       dloc[prow]  = nzi;
48564e2b4712SSatish Balay       fm          = fill[n];
48574e2b4712SSatish Balay       while (nzf--) {
48585d0c19d7SBarry Smith 	*xitmp++ = fm;
48594e2b4712SSatish Balay 	*flev++ = im[fm];
48604e2b4712SSatish Balay 	fm      = fill[fm];
48614e2b4712SSatish Balay       }
4862435faa5fSBarry Smith       /* make sure row has diagonal entry */
4863435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
486477431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
48652401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4866435faa5fSBarry Smith       }
48674e2b4712SSatish Balay     }
4868606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
48694e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
48704e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4871606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
4872606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
48734e2b4712SSatish Balay 
48746cf91177SBarry Smith #if defined(PETSC_USE_INFO)
48754e2b4712SSatish Balay     {
4876329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4877ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4878ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4879ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4880ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4881335d9088SBarry Smith       if (diagonal_fill) {
4882ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4883335d9088SBarry Smith       }
48844e2b4712SSatish Balay     }
488563ba0a88SBarry Smith #endif
48864e2b4712SSatish Balay 
48874e2b4712SSatish Balay     /* put together the new matrix */
4888719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4889719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4890719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
4891e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
4892e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
48937c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
4894a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
48954e2b4712SSatish Balay     b->j          = ajnew;
48964e2b4712SSatish Balay     b->i          = ainew;
48974e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
48984e2b4712SSatish Balay     b->diag       = dloc;
48997f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
49004e2b4712SSatish Balay     b->ilen       = 0;
49014e2b4712SSatish Balay     b->imax       = 0;
49024e2b4712SSatish Balay     b->row        = isrow;
49034e2b4712SSatish Balay     b->col        = iscol;
4904bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4905c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4906c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4907e51c0b9cSSatish Balay     b->icol       = isicol;
490887828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
49094e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
49104e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
4911719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
49124e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
49134e2b4712SSatish Balay 
4914719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
4915719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
4916719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
49176bce7ff8SHong Zhang 
491841df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
49198661488fSKris Buschelman   PetscFunctionReturn(0);
49208661488fSKris Buschelman }
49218661488fSKris Buschelman 
4922732ee342SKris Buschelman #undef __FUNCT__
49237e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
4924dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
49257e7071cdSKris Buschelman {
492612272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
492712272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
49285a9542e3SKris Buschelman   PetscFunctionBegin;
49297cf1b8d3SKris Buschelman   /* Undo Column scaling */
49307cf1b8d3SKris Buschelman /*    while (nz--) { */
49317cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
49327cf1b8d3SKris Buschelman /*    } */
4933c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
4934c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
49357cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
49367cf1b8d3SKris Buschelman }
49377cf1b8d3SKris Buschelman 
49387cf1b8d3SKris Buschelman #undef __FUNCT__
49397cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4940dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
49417cf1b8d3SKris Buschelman {
49427cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4943b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
49442aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
49455a9542e3SKris Buschelman   PetscFunctionBegin;
49460b9da03eSKris Buschelman   /* Is this really necessary? */
494720235379SKris Buschelman   while (nz--) {
49480b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
49497e7071cdSKris Buschelman   }
4950c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
49517e7071cdSKris Buschelman   PetscFunctionReturn(0);
49527e7071cdSKris Buschelman }
49537e7071cdSKris Buschelman 
4954732ee342SKris Buschelman 
4955