xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 0c4413a7bab1432485e668e274eb71cdbea65977)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
12818f690400SShri Abhyankar {
12828f690400SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12838f690400SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
12848f690400SShri Abhyankar   PetscErrorCode ierr;
12858f690400SShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
128629b92fc1SShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
12878f690400SShri Abhyankar   MatScalar      *aa=a->a,*v;
12888f690400SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
12898f690400SShri Abhyankar   PetscScalar    *x,*b,*t;
12908f690400SShri Abhyankar 
12918f690400SShri Abhyankar   PetscFunctionBegin;
12928f690400SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12938f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12948f690400SShri Abhyankar   t  = a->solve_work;
12958f690400SShri Abhyankar 
12968f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
129729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12988f690400SShri Abhyankar 
12998f690400SShri Abhyankar   /* forward solve the lower triangular */
130029b92fc1SShri Abhyankar   idx    = 7*r[0];
13018f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
13028f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
13038f690400SShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
13048f690400SShri Abhyankar 
13058f690400SShri Abhyankar   for (i=1; i<n; i++) {
13068f690400SShri Abhyankar     v     = aa + 49*ai[i];
13078f690400SShri Abhyankar     vi    = aj + ai[i];
13088f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
130929b92fc1SShri Abhyankar     idx   = 7*r[i];
13108f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
13118f690400SShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
131229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
131329b92fc1SShri Abhyankar       idx   = 7*vi[m];
13148f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
13158f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
13168f690400SShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
13178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13228f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13238f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13248f690400SShri Abhyankar       v += 49;
13258f690400SShri Abhyankar     }
13268f690400SShri Abhyankar     idx = 7*i;
13278f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
13288f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
13298f690400SShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
13308f690400SShri Abhyankar   }
13318f690400SShri Abhyankar   /* backward solve the upper triangular */
13328f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
13338f690400SShri Abhyankar     k    = 2*n-i;
13348f690400SShri Abhyankar     v    = aa + 49*ai[k];
13358f690400SShri Abhyankar     vi   = aj + ai[k];
13368f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
13378f690400SShri Abhyankar     idt  = 7*i;
13388f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
13398f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
13408f690400SShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
134129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
134229b92fc1SShri Abhyankar       idx   = 7*vi[m];
13438f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
13448f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13458f690400SShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
13468f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13478f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13488f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13498f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13508f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13518f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13528f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13538f690400SShri Abhyankar       v += 49;
13548f690400SShri Abhyankar     }
135529b92fc1SShri Abhyankar     idc = 7*c[i];
13568f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13578f690400SShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13588f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13598f690400SShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
13608f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
13618f690400SShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
13628f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
13638f690400SShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
13648f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
13658f690400SShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
13668f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
13678f690400SShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
13688f690400SShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
13698f690400SShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13708f690400SShri Abhyankar   }
13718f690400SShri Abhyankar 
13728f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13738f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13748f690400SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13758f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13768f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13778f690400SShri Abhyankar   PetscFunctionReturn(0);
13788f690400SShri Abhyankar }
13798f690400SShri Abhyankar 
13808f690400SShri Abhyankar #undef __FUNCT__
13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
138315091d37SBarry Smith {
138415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386dfbe8321SBarry Smith   PetscErrorCode    ierr;
1387690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1388d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1389d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390d9fead3dSBarry Smith   const PetscScalar *b;
139115091d37SBarry Smith 
139215091d37SBarry Smith   PetscFunctionBegin;
1393d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
139515091d37SBarry Smith   /* forward solve the lower triangular */
139615091d37SBarry Smith   idx    = 0;
139715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
139815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
139915091d37SBarry Smith   x[6] = b[6+idx];
140015091d37SBarry Smith   for (i=1; i<n; i++) {
140115091d37SBarry Smith     v     =  aa + 49*ai[i];
140215091d37SBarry Smith     vi    =  aj + ai[i];
140315091d37SBarry Smith     nz    =  diag[i] - ai[i];
140415091d37SBarry Smith     idx   =  7*i;
1405f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407f1af5d2fSBarry Smith     s7  =  b[6+idx];
140815091d37SBarry Smith     while (nz--) {
140915091d37SBarry Smith       jdx   = 7*(*vi++);
141015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
141115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
141215091d37SBarry Smith       x7    = x[6+jdx];
1413f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
142015091d37SBarry Smith       v += 49;
142115091d37SBarry Smith      }
1422f1af5d2fSBarry Smith     x[idx]   = s1;
1423f1af5d2fSBarry Smith     x[1+idx] = s2;
1424f1af5d2fSBarry Smith     x[2+idx] = s3;
1425f1af5d2fSBarry Smith     x[3+idx] = s4;
1426f1af5d2fSBarry Smith     x[4+idx] = s5;
1427f1af5d2fSBarry Smith     x[5+idx] = s6;
1428f1af5d2fSBarry Smith     x[6+idx] = s7;
142915091d37SBarry Smith   }
143015091d37SBarry Smith   /* backward solve the upper triangular */
143115091d37SBarry Smith   for (i=n-1; i>=0; i--){
143215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
143315091d37SBarry Smith     vi   = aj + diag[i] + 1;
143415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
143515091d37SBarry Smith     idt  = 7*i;
1436f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1437f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1438f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1439f1af5d2fSBarry Smith     s7 = x[6+idt];
144015091d37SBarry Smith     while (nz--) {
144115091d37SBarry Smith       idx   = 7*(*vi++);
144215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
144315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
144415091d37SBarry Smith       x7    = x[6+idx];
1445f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
145215091d37SBarry Smith       v += 49;
145315091d37SBarry Smith     }
145415091d37SBarry Smith     v        = aa + 49*diag[i];
1455f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
146915091d37SBarry Smith   }
147015091d37SBarry Smith 
1471d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
147415091d37SBarry Smith   PetscFunctionReturn(0);
147515091d37SBarry Smith }
147615091d37SBarry Smith 
14774a2ae208SSatish Balay #undef __FUNCT__
1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480cee9d6f2SShri Abhyankar {
1481cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
14826464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1484cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1485cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1487cee9d6f2SShri Abhyankar     PetscScalar       *x;
1488cee9d6f2SShri Abhyankar     const PetscScalar *b;
1489cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490cee9d6f2SShri Abhyankar 
1491cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1492cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1495cee9d6f2SShri Abhyankar     idx    = 0;
1496cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1499cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1500cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1501cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1502cee9d6f2SShri Abhyankar       idx   = bs*i;
1503cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
15056464896eSShri Abhyankar        for(k=0;k<nz;k++) {
15066464896eSShri Abhyankar           jdx   = bs*vi[k];
1507cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516cee9d6f2SShri Abhyankar           v   +=  bs2;
1517cee9d6f2SShri Abhyankar         }
1518cee9d6f2SShri Abhyankar 
1519cee9d6f2SShri Abhyankar        x[idx]   = s1;
1520cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1521cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1522cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1523cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1524cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1525cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1526cee9d6f2SShri Abhyankar     }
1527cee9d6f2SShri Abhyankar 
1528cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1529cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1530cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1531cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1532cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533cee9d6f2SShri Abhyankar      idt = bs*i;
1534cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
15366464896eSShri Abhyankar     for(k=0;k<nz;k++) {
15376464896eSShri Abhyankar       idx   = bs*vi[k];
1538cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547cee9d6f2SShri Abhyankar         v   +=  bs2;
1548cee9d6f2SShri Abhyankar     }
1549cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1550cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557cee9d6f2SShri Abhyankar   }
1558cee9d6f2SShri Abhyankar 
1559cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1563cee9d6f2SShri Abhyankar }
1564cee9d6f2SShri Abhyankar 
1565cee9d6f2SShri Abhyankar #undef __FUNCT__
156653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
156753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
156853cca76cSShri Abhyankar {
156953cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
157053cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
157153cca76cSShri Abhyankar     PetscErrorCode    ierr;
157253cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
157353cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
157453cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
157553cca76cSShri Abhyankar     PetscScalar       *x;
157653cca76cSShri Abhyankar     const PetscScalar *b;
157753cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
157853cca76cSShri Abhyankar 
157953cca76cSShri Abhyankar     PetscFunctionBegin;
158053cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
158153cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
158253cca76cSShri Abhyankar     /* forward solve the lower triangular */
158353cca76cSShri Abhyankar     idx    = 0;
158453cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
158553cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
158653cca76cSShri Abhyankar     for (i=1; i<n; i++) {
158753cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
158853cca76cSShri Abhyankar        vi   = aj + ai[i];
158953cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
159053cca76cSShri Abhyankar       idx   = bs*i;
159153cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
159253cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
159353cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
159453cca76cSShri Abhyankar           jdx   = bs*vi[k];
159553cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
159653cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
159753cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
159853cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
159953cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
160053cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
160153cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
160253cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
160353cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
160453cca76cSShri Abhyankar           v   +=  bs2;
160553cca76cSShri Abhyankar         }
160653cca76cSShri Abhyankar 
160753cca76cSShri Abhyankar        x[idx]   = s1;
160853cca76cSShri Abhyankar        x[1+idx] = s2;
160953cca76cSShri Abhyankar        x[2+idx] = s3;
161053cca76cSShri Abhyankar        x[3+idx] = s4;
161153cca76cSShri Abhyankar        x[4+idx] = s5;
161253cca76cSShri Abhyankar        x[5+idx] = s6;
161353cca76cSShri Abhyankar        x[6+idx] = s7;
161453cca76cSShri Abhyankar     }
161553cca76cSShri Abhyankar 
161653cca76cSShri Abhyankar    /* backward solve the upper triangular */
161753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
161853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
161953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
162053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
162153cca76cSShri Abhyankar      idt = bs*i;
162253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
162353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
162453cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
162553cca76cSShri Abhyankar       idx   = bs*vi[k];
162653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
162753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
162853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
162953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
163053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
163153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
163253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
163353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
163453cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
163553cca76cSShri Abhyankar         v   +=  bs2;
163653cca76cSShri Abhyankar     }
163753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
163853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
163953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
164053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
164153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
164253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
164353cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
164453cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
164553cca76cSShri Abhyankar   }
164653cca76cSShri Abhyankar 
164753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
164853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
165053cca76cSShri Abhyankar   PetscFunctionReturn(0);
165153cca76cSShri Abhyankar }
165253cca76cSShri Abhyankar 
165353cca76cSShri Abhyankar #undef __FUNCT__
16544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1655dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
165615091d37SBarry Smith {
165715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
165815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
16596849ba73SBarry Smith   PetscErrorCode    ierr;
16605d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
16615d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1662d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1663d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1664d9fead3dSBarry Smith   const PetscScalar *b;
166515091d37SBarry Smith   PetscFunctionBegin;
1666d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668f1af5d2fSBarry Smith   t  = a->solve_work;
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
167215091d37SBarry Smith 
167315091d37SBarry Smith   /* forward solve the lower triangular */
167415091d37SBarry Smith   idx    = 6*(*r++);
1675f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1676f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1677f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
167815091d37SBarry Smith   for (i=1; i<n; i++) {
167915091d37SBarry Smith     v     = aa + 36*ai[i];
168015091d37SBarry Smith     vi    = aj + ai[i];
168115091d37SBarry Smith     nz    = diag[i] - ai[i];
168215091d37SBarry Smith     idx   = 6*(*r++);
1683f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1684f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
168515091d37SBarry Smith     while (nz--) {
168615091d37SBarry Smith       idx   = 6*(*vi++);
1687f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1688f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1689f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1690f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1691f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1692f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1693f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1694f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
169515091d37SBarry Smith       v += 36;
169615091d37SBarry Smith     }
169715091d37SBarry Smith     idx = 6*i;
1698f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1699f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1700f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
170115091d37SBarry Smith   }
170215091d37SBarry Smith   /* backward solve the upper triangular */
170315091d37SBarry Smith   for (i=n-1; i>=0; i--){
170415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
170515091d37SBarry Smith     vi   = aj + diag[i] + 1;
170615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
170715091d37SBarry Smith     idt  = 6*i;
1708f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1709f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1710f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
171115091d37SBarry Smith     while (nz--) {
171215091d37SBarry Smith       idx   = 6*(*vi++);
1713f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1714f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1715f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1716f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1717f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1718f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1719f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1720f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1721f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
172215091d37SBarry Smith       v += 36;
172315091d37SBarry Smith     }
172415091d37SBarry Smith     idc = 6*(*c--);
172515091d37SBarry Smith     v   = aa + 36*diag[i];
1726f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1727f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1728f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1729f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1730f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1731f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1732f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1733f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1734f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1735f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1736f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1737f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
173815091d37SBarry Smith   }
173915091d37SBarry Smith 
174015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
174115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1742d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1744dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
174515091d37SBarry Smith   PetscFunctionReturn(0);
174615091d37SBarry Smith }
174715091d37SBarry Smith 
17484a2ae208SSatish Balay #undef __FUNCT__
17498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
17508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
17518f690400SShri Abhyankar {
17528f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17538f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
17548f690400SShri Abhyankar   PetscErrorCode    ierr;
17558f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
175629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
17578f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
17588f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
17598f690400SShri Abhyankar   const PetscScalar *b;
17608f690400SShri Abhyankar   PetscFunctionBegin;
17618f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17628f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
17638f690400SShri Abhyankar   t  = a->solve_work;
17648f690400SShri Abhyankar 
17658f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
176629b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
17678f690400SShri Abhyankar 
17688f690400SShri Abhyankar   /* forward solve the lower triangular */
176929b92fc1SShri Abhyankar   idx    = 6*r[0];
17708f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
17718f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
17728f690400SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
17738f690400SShri Abhyankar   for (i=1; i<n; i++) {
17748f690400SShri Abhyankar     v     = aa + 36*ai[i];
17758f690400SShri Abhyankar     vi    = aj + ai[i];
17768f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
177729b92fc1SShri Abhyankar     idx   = 6*r[i];
17788f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17798f690400SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
178029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
178129b92fc1SShri Abhyankar       idx   = 6*vi[m];
17828f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
17838f690400SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
17848f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
17858f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
17868f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
17878f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
17888f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
17898f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
17908f690400SShri Abhyankar       v += 36;
17918f690400SShri Abhyankar     }
17928f690400SShri Abhyankar     idx = 6*i;
17938f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
17948f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
17958f690400SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
17968f690400SShri Abhyankar   }
17978f690400SShri Abhyankar   /* backward solve the upper triangular */
17988f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
17998f690400SShri Abhyankar     k    = 2*n-i;
18008f690400SShri Abhyankar     v    = aa + 36*ai[k];
18018f690400SShri Abhyankar     vi   = aj + ai[k];
18028f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
18038f690400SShri Abhyankar     idt  = 6*i;
18048f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
18058f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
18068f690400SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
180729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
180829b92fc1SShri Abhyankar       idx   = 6*vi[m];
18098f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
18108f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
18118f690400SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
18128f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18138f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18148f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18158f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18168f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18178f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18188f690400SShri Abhyankar       v += 36;
18198f690400SShri Abhyankar     }
182029b92fc1SShri Abhyankar     idc = 6*c[i];
18218f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
18228f690400SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
18238f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
18248f690400SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
18258f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
18268f690400SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
18278f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
18288f690400SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
18298f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
18308f690400SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
18318f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
18328f690400SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
18338f690400SShri Abhyankar   }
18348f690400SShri Abhyankar 
18358f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18368f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18378f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18388f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
18398f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
18408f690400SShri Abhyankar   PetscFunctionReturn(0);
18418f690400SShri Abhyankar }
18428f690400SShri Abhyankar 
18438f690400SShri Abhyankar 
18448f690400SShri Abhyankar #undef __FUNCT__
18454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1846dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
184715091d37SBarry Smith {
184815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1849690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850dfbe8321SBarry Smith   PetscErrorCode    ierr;
1851690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1852d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1853d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1854d9fead3dSBarry Smith   const PetscScalar *b;
185515091d37SBarry Smith 
185615091d37SBarry Smith   PetscFunctionBegin;
1857d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18581ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185915091d37SBarry Smith   /* forward solve the lower triangular */
186015091d37SBarry Smith   idx    = 0;
186115091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
186215091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
186315091d37SBarry Smith   for (i=1; i<n; i++) {
186415091d37SBarry Smith     v     =  aa + 36*ai[i];
186515091d37SBarry Smith     vi    =  aj + ai[i];
186615091d37SBarry Smith     nz    =  diag[i] - ai[i];
186715091d37SBarry Smith     idx   =  6*i;
1868f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1869f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
187015091d37SBarry Smith     while (nz--) {
187115091d37SBarry Smith       jdx   = 6*(*vi++);
187215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
187315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1874f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1875f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1876f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1877f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1878f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1879f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
188015091d37SBarry Smith       v += 36;
188115091d37SBarry Smith      }
1882f1af5d2fSBarry Smith     x[idx]   = s1;
1883f1af5d2fSBarry Smith     x[1+idx] = s2;
1884f1af5d2fSBarry Smith     x[2+idx] = s3;
1885f1af5d2fSBarry Smith     x[3+idx] = s4;
1886f1af5d2fSBarry Smith     x[4+idx] = s5;
1887f1af5d2fSBarry Smith     x[5+idx] = s6;
188815091d37SBarry Smith   }
188915091d37SBarry Smith   /* backward solve the upper triangular */
189015091d37SBarry Smith   for (i=n-1; i>=0; i--){
189115091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
189215091d37SBarry Smith     vi   = aj + diag[i] + 1;
189315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
189415091d37SBarry Smith     idt  = 6*i;
1895f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1896f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1897f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
189815091d37SBarry Smith     while (nz--) {
189915091d37SBarry Smith       idx   = 6*(*vi++);
190015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
190115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1902f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1903f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1904f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1905f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1906f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1907f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
190815091d37SBarry Smith       v += 36;
190915091d37SBarry Smith     }
191015091d37SBarry Smith     v        = aa + 36*diag[i];
1911f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1912f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1913f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1914f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1915f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1916f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
191715091d37SBarry Smith   }
191815091d37SBarry Smith 
1919d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1921dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
192215091d37SBarry Smith   PetscFunctionReturn(0);
192315091d37SBarry Smith }
192415091d37SBarry Smith 
19254a2ae208SSatish Balay #undef __FUNCT__
1926cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1927cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1928cee9d6f2SShri Abhyankar {
1929cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
19306464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1931cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1932cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1933cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1934cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1935cee9d6f2SShri Abhyankar     PetscScalar       *x;
1936cee9d6f2SShri Abhyankar     const PetscScalar *b;
1937cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1938cee9d6f2SShri Abhyankar 
1939cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1940cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1941cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1942cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1943cee9d6f2SShri Abhyankar     idx    = 0;
1944cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1945cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
1946cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1947cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1948cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1949cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1950cee9d6f2SShri Abhyankar       idx   = bs*i;
1951cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1952cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
19536464896eSShri Abhyankar        for(k=0;k<nz;k++){
19546464896eSShri Abhyankar           jdx   = bs*vi[k];
1955cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1956cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1957cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1958cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1959cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1960cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1961cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1962cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1963cee9d6f2SShri Abhyankar           v   +=  bs2;
1964cee9d6f2SShri Abhyankar         }
1965cee9d6f2SShri Abhyankar 
1966cee9d6f2SShri Abhyankar        x[idx]   = s1;
1967cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1968cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1969cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1970cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1971cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1972cee9d6f2SShri Abhyankar     }
1973cee9d6f2SShri Abhyankar 
1974cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1975cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1976cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1977cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1978cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1979cee9d6f2SShri Abhyankar      idt = bs*i;
1980cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1981cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
19826464896eSShri Abhyankar      for(k=0;k<nz;k++){
19836464896eSShri Abhyankar       idx   = bs*vi[k];
1984cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1985cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
1986cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1987cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1988cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1989cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1990cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1991cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1992cee9d6f2SShri Abhyankar         v   +=  bs2;
1993cee9d6f2SShri Abhyankar     }
1994cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1995cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1996cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1997cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1998cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1999cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2000cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2001cee9d6f2SShri Abhyankar   }
2002cee9d6f2SShri Abhyankar 
2003cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2004cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2005cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2006cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2007cee9d6f2SShri Abhyankar }
20088f690400SShri Abhyankar 
2009cee9d6f2SShri Abhyankar #undef __FUNCT__
201053cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
201153cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
201253cca76cSShri Abhyankar {
201353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
201453cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
201553cca76cSShri Abhyankar     PetscErrorCode    ierr;
201653cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
201753cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
201853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
201953cca76cSShri Abhyankar     PetscScalar       *x;
202053cca76cSShri Abhyankar     const PetscScalar *b;
202153cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
202253cca76cSShri Abhyankar 
202353cca76cSShri Abhyankar     PetscFunctionBegin;
202453cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
202553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
202653cca76cSShri Abhyankar     /* forward solve the lower triangular */
202753cca76cSShri Abhyankar     idx    = 0;
202853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
202953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
203053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
203153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
203253cca76cSShri Abhyankar        vi   = aj + ai[i];
203353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
203453cca76cSShri Abhyankar       idx   = bs*i;
203553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
203653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
203753cca76cSShri Abhyankar        for(k=0;k<nz;k++){
203853cca76cSShri Abhyankar           jdx   = bs*vi[k];
203953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
204053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
204153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
204253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
204353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
204453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
204553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
204653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
204753cca76cSShri Abhyankar           v   +=  bs2;
204853cca76cSShri Abhyankar         }
204953cca76cSShri Abhyankar 
205053cca76cSShri Abhyankar        x[idx]   = s1;
205153cca76cSShri Abhyankar        x[1+idx] = s2;
205253cca76cSShri Abhyankar        x[2+idx] = s3;
205353cca76cSShri Abhyankar        x[3+idx] = s4;
205453cca76cSShri Abhyankar        x[4+idx] = s5;
205553cca76cSShri Abhyankar        x[5+idx] = s6;
205653cca76cSShri Abhyankar     }
205753cca76cSShri Abhyankar 
205853cca76cSShri Abhyankar    /* backward solve the upper triangular */
205953cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
206053cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
206153cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
206253cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
206353cca76cSShri Abhyankar      idt = bs*i;
206453cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
206553cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
206653cca76cSShri Abhyankar      for(k=0;k<nz;k++){
206753cca76cSShri Abhyankar       idx   = bs*vi[k];
206853cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
206953cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
207053cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
207153cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
207253cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
207353cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
207453cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
207553cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
207653cca76cSShri Abhyankar         v   +=  bs2;
207753cca76cSShri Abhyankar     }
207853cca76cSShri Abhyankar     /* x = inv_diagonal*x */
207953cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
208053cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
208153cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
208253cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
208353cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
208453cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
208553cca76cSShri Abhyankar   }
208653cca76cSShri Abhyankar 
208753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
208853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
208953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
209053cca76cSShri Abhyankar   PetscFunctionReturn(0);
209153cca76cSShri Abhyankar }
209253cca76cSShri Abhyankar 
209353cca76cSShri Abhyankar #undef __FUNCT__
20944a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2095dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
20964e2b4712SSatish Balay {
20974e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20984e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
20996849ba73SBarry Smith   PetscErrorCode    ierr;
21005d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
21015d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2102d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2103d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2104d9fead3dSBarry Smith   const PetscScalar *b;
21054e2b4712SSatish Balay 
21064e2b4712SSatish Balay   PetscFunctionBegin;
2107d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2109f1af5d2fSBarry Smith   t  = a->solve_work;
21104e2b4712SSatish Balay 
21114e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21124e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21134e2b4712SSatish Balay 
21144e2b4712SSatish Balay   /* forward solve the lower triangular */
21154e2b4712SSatish Balay   idx    = 5*(*r++);
2116f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2117f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
21184e2b4712SSatish Balay   for (i=1; i<n; i++) {
21194e2b4712SSatish Balay     v     = aa + 25*ai[i];
21204e2b4712SSatish Balay     vi    = aj + ai[i];
21214e2b4712SSatish Balay     nz    = diag[i] - ai[i];
21224e2b4712SSatish Balay     idx   = 5*(*r++);
2123f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2124f1af5d2fSBarry Smith     s5  = b[4+idx];
21254e2b4712SSatish Balay     while (nz--) {
21264e2b4712SSatish Balay       idx   = 5*(*vi++);
2127f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2128f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2129f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2130f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2131f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2132f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2133f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
21344e2b4712SSatish Balay       v += 25;
21354e2b4712SSatish Balay     }
21364e2b4712SSatish Balay     idx = 5*i;
2137f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2138f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
21394e2b4712SSatish Balay   }
21404e2b4712SSatish Balay   /* backward solve the upper triangular */
21414e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21424e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
21434e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21444e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
21454e2b4712SSatish Balay     idt  = 5*i;
2146f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2147f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
21484e2b4712SSatish Balay     while (nz--) {
21494e2b4712SSatish Balay       idx   = 5*(*vi++);
2150f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2151f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2152f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2153f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2154f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2155f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2156f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
21574e2b4712SSatish Balay       v += 25;
21584e2b4712SSatish Balay     }
21594e2b4712SSatish Balay     idc = 5*(*c--);
21604e2b4712SSatish Balay     v   = aa + 25*diag[i];
2161f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2162f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2163f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2164f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2165f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2166f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2167f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2168f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2169f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2170f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
21714e2b4712SSatish Balay   }
21724e2b4712SSatish Balay 
21734e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21744e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2175d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21761ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2177dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
21784e2b4712SSatish Balay   PetscFunctionReturn(0);
21794e2b4712SSatish Balay }
21804e2b4712SSatish Balay 
21814a2ae208SSatish Balay #undef __FUNCT__
21828f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
21838f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
21848f690400SShri Abhyankar {
21858f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21868f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
21878f690400SShri Abhyankar   PetscErrorCode    ierr;
21888f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
218929b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
21908f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
21918f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
21928f690400SShri Abhyankar   const PetscScalar *b;
21938f690400SShri Abhyankar 
21948f690400SShri Abhyankar   PetscFunctionBegin;
21958f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21968f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21978f690400SShri Abhyankar   t  = a->solve_work;
21988f690400SShri Abhyankar 
21998f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
220029b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22018f690400SShri Abhyankar 
22028f690400SShri Abhyankar   /* forward solve the lower triangular */
220329b92fc1SShri Abhyankar   idx    = 5*r[0];
22048f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
22058f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
22068f690400SShri Abhyankar   for (i=1; i<n; i++) {
22078f690400SShri Abhyankar     v     = aa + 25*ai[i];
22088f690400SShri Abhyankar     vi    = aj + ai[i];
22098f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
221029b92fc1SShri Abhyankar     idx   = 5*r[i];
22118f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22128f690400SShri Abhyankar     s5  = b[4+idx];
221329b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
221429b92fc1SShri Abhyankar       idx   = 5*vi[m];
22158f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
22168f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
22178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
22188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
22198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
22208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
22218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22228f690400SShri Abhyankar       v += 25;
22238f690400SShri Abhyankar     }
22248f690400SShri Abhyankar     idx = 5*i;
22258f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
22268f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
22278f690400SShri Abhyankar   }
22288f690400SShri Abhyankar   /* backward solve the upper triangular */
22298f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
22308f690400SShri Abhyankar     k    = 2*n-i;
22318f690400SShri Abhyankar     v    = aa + 25*ai[k];
22328f690400SShri Abhyankar     vi   = aj + ai[k];
22338f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
22348f690400SShri Abhyankar     idt  = 5*i;
22358f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
22368f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
223729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
223829b92fc1SShri Abhyankar       idx   = 5*vi[m];
22398f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
22408f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
22418f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
22428f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
22438f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
22448f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
22458f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22468f690400SShri Abhyankar       v += 25;
22478f690400SShri Abhyankar     }
224829b92fc1SShri Abhyankar     idc = 5*c[i];
22498f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
22508f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
22518f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
22528f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
22538f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
22548f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
22558f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
22568f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
22578f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
22588f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
22598f690400SShri Abhyankar   }
22608f690400SShri Abhyankar 
22618f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22628f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22638f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22648f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22658f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
22668f690400SShri Abhyankar   PetscFunctionReturn(0);
22678f690400SShri Abhyankar }
22688f690400SShri Abhyankar #undef __FUNCT__
22694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2270dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
227115091d37SBarry Smith {
227215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2273690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2274dfbe8321SBarry Smith   PetscErrorCode    ierr;
2275690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2276d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2277d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2278d9fead3dSBarry Smith   const PetscScalar *b;
227915091d37SBarry Smith 
228015091d37SBarry Smith   PetscFunctionBegin;
2281d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
228315091d37SBarry Smith   /* forward solve the lower triangular */
228415091d37SBarry Smith   idx    = 0;
228515091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
228615091d37SBarry Smith   for (i=1; i<n; i++) {
228715091d37SBarry Smith     v     =  aa + 25*ai[i];
228815091d37SBarry Smith     vi    =  aj + ai[i];
228915091d37SBarry Smith     nz    =  diag[i] - ai[i];
229015091d37SBarry Smith     idx   =  5*i;
2291f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
229215091d37SBarry Smith     while (nz--) {
229315091d37SBarry Smith       jdx   = 5*(*vi++);
229415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2295f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2296f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2297f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2298f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2299f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
230015091d37SBarry Smith       v    += 25;
230115091d37SBarry Smith     }
2302f1af5d2fSBarry Smith     x[idx]   = s1;
2303f1af5d2fSBarry Smith     x[1+idx] = s2;
2304f1af5d2fSBarry Smith     x[2+idx] = s3;
2305f1af5d2fSBarry Smith     x[3+idx] = s4;
2306f1af5d2fSBarry Smith     x[4+idx] = s5;
230715091d37SBarry Smith   }
230815091d37SBarry Smith   /* backward solve the upper triangular */
230915091d37SBarry Smith   for (i=n-1; i>=0; i--){
231015091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
231115091d37SBarry Smith     vi   = aj + diag[i] + 1;
231215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
231315091d37SBarry Smith     idt  = 5*i;
2314f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2315f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
231615091d37SBarry Smith     while (nz--) {
231715091d37SBarry Smith       idx   = 5*(*vi++);
231815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2319f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2320f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2321f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2322f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2323f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
232415091d37SBarry Smith       v    += 25;
232515091d37SBarry Smith     }
232615091d37SBarry Smith     v        = aa + 25*diag[i];
2327f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2328f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2329f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2330f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2331f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
233215091d37SBarry Smith   }
233315091d37SBarry Smith 
2334d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23351ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2336dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
233715091d37SBarry Smith   PetscFunctionReturn(0);
233815091d37SBarry Smith }
233915091d37SBarry Smith 
23404a2ae208SSatish Balay #undef __FUNCT__
2341cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2342cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2343cee9d6f2SShri Abhyankar {
2344cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
23456464896eSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2346cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
2347cee9d6f2SShri Abhyankar   PetscInt          jdx;
2348cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2349cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2350cee9d6f2SShri Abhyankar   const PetscScalar *b;
2351cee9d6f2SShri Abhyankar 
2352cee9d6f2SShri Abhyankar   PetscFunctionBegin;
2353cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2354cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2355cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
2356cee9d6f2SShri Abhyankar   idx    = 0;
2357cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2358cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
2359cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
2360cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
2361cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
2362cee9d6f2SShri Abhyankar     idx = 5*i;
2363cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
23646464896eSShri Abhyankar     for(k=0;k<nz;k++) {
23656464896eSShri Abhyankar       jdx   = 5*vi[k];
2366cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2367cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2368cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2369cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2370cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2371cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2372cee9d6f2SShri Abhyankar       v    += 25;
2373cee9d6f2SShri Abhyankar     }
2374cee9d6f2SShri Abhyankar     x[idx]   = s1;
2375cee9d6f2SShri Abhyankar     x[1+idx] = s2;
2376cee9d6f2SShri Abhyankar     x[2+idx] = s3;
2377cee9d6f2SShri Abhyankar     x[3+idx] = s4;
2378cee9d6f2SShri Abhyankar     x[4+idx] = s5;
2379cee9d6f2SShri Abhyankar   }
2380cee9d6f2SShri Abhyankar 
2381cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
2382cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2383cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
2384cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
2385cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2386cee9d6f2SShri Abhyankar     idt = 5*i;
2387cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
2388cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
23896464896eSShri Abhyankar     for(k=0;k<nz;k++){
23906464896eSShri Abhyankar       idx   = 5*vi[k];
2391cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2392cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2393cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2394cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2395cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2396cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2397cee9d6f2SShri Abhyankar       v    += 25;
2398cee9d6f2SShri Abhyankar     }
2399cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2400cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2401cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2402cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2403cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2404cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2405cee9d6f2SShri Abhyankar   }
2406cee9d6f2SShri Abhyankar 
2407cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2408cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2409cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2410cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2411cee9d6f2SShri Abhyankar }
2412cee9d6f2SShri Abhyankar 
2413cee9d6f2SShri Abhyankar #undef __FUNCT__
241453cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
241553cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
241653cca76cSShri Abhyankar {
241753cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
241853cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
241953cca76cSShri Abhyankar   PetscErrorCode    ierr;
242053cca76cSShri Abhyankar   PetscInt          jdx;
242153cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
242253cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
242353cca76cSShri Abhyankar   const PetscScalar *b;
242453cca76cSShri Abhyankar 
242553cca76cSShri Abhyankar   PetscFunctionBegin;
242653cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
242753cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
242853cca76cSShri Abhyankar   /* forward solve the lower triangular */
242953cca76cSShri Abhyankar   idx    = 0;
243053cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
243153cca76cSShri Abhyankar   for (i=1; i<n; i++) {
243253cca76cSShri Abhyankar     v   = aa + 25*ai[i];
243353cca76cSShri Abhyankar     vi  = aj + ai[i];
243453cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
243553cca76cSShri Abhyankar     idx = 5*i;
243653cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
243753cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
243853cca76cSShri Abhyankar       jdx   = 5*vi[k];
243953cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
244053cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
244153cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
244253cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
244353cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
244453cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
244553cca76cSShri Abhyankar       v    += 25;
244653cca76cSShri Abhyankar     }
244753cca76cSShri Abhyankar     x[idx]   = s1;
244853cca76cSShri Abhyankar     x[1+idx] = s2;
244953cca76cSShri Abhyankar     x[2+idx] = s3;
245053cca76cSShri Abhyankar     x[3+idx] = s4;
245153cca76cSShri Abhyankar     x[4+idx] = s5;
245253cca76cSShri Abhyankar   }
245353cca76cSShri Abhyankar 
245453cca76cSShri Abhyankar   /* backward solve the upper triangular */
245553cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
245653cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
245753cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
245853cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
245953cca76cSShri Abhyankar     idt = 5*i;
246053cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
246153cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
246253cca76cSShri Abhyankar     for(k=0;k<nz;k++){
246353cca76cSShri Abhyankar       idx   = 5*vi[k];
246453cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
246553cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
246653cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
246753cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
246853cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
246953cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
247053cca76cSShri Abhyankar       v    += 25;
247153cca76cSShri Abhyankar     }
247253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
247353cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
247453cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
247553cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
247653cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
247753cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
247853cca76cSShri Abhyankar   }
247953cca76cSShri Abhyankar 
248053cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
248153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
248253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
248353cca76cSShri Abhyankar   PetscFunctionReturn(0);
248453cca76cSShri Abhyankar }
248553cca76cSShri Abhyankar 
248653cca76cSShri Abhyankar #undef __FUNCT__
24874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2488dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
24894e2b4712SSatish Balay {
24904e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
24914e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
24926849ba73SBarry Smith   PetscErrorCode    ierr;
24935d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
24945d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2495d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2496d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2497d9fead3dSBarry Smith   const PetscScalar *b;
24984e2b4712SSatish Balay 
24994e2b4712SSatish Balay   PetscFunctionBegin;
2500d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2502f1af5d2fSBarry Smith   t  = a->solve_work;
25034e2b4712SSatish Balay 
25044e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25054e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25064e2b4712SSatish Balay 
25074e2b4712SSatish Balay   /* forward solve the lower triangular */
25084e2b4712SSatish Balay   idx    = 4*(*r++);
2509f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2510f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
25114e2b4712SSatish Balay   for (i=1; i<n; i++) {
25124e2b4712SSatish Balay     v     = aa + 16*ai[i];
25134e2b4712SSatish Balay     vi    = aj + ai[i];
25144e2b4712SSatish Balay     nz    = diag[i] - ai[i];
25154e2b4712SSatish Balay     idx   = 4*(*r++);
2516f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
25174e2b4712SSatish Balay     while (nz--) {
25184e2b4712SSatish Balay       idx   = 4*(*vi++);
2519f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2520f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2521f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2522f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2523f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
25244e2b4712SSatish Balay       v    += 16;
25254e2b4712SSatish Balay     }
25264e2b4712SSatish Balay     idx        = 4*i;
2527f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2528f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
25294e2b4712SSatish Balay   }
25304e2b4712SSatish Balay   /* backward solve the upper triangular */
25314e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
25324e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
25334e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
25344e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
25354e2b4712SSatish Balay     idt  = 4*i;
2536f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2537f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
25384e2b4712SSatish Balay     while (nz--) {
25394e2b4712SSatish Balay       idx   = 4*(*vi++);
2540f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2541f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2542f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2543f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2544f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2545f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
25464e2b4712SSatish Balay       v += 16;
25474e2b4712SSatish Balay     }
25484e2b4712SSatish Balay     idc      = 4*(*c--);
25494e2b4712SSatish Balay     v        = aa + 16*diag[i];
2550f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2551f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2552f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2553f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
25544e2b4712SSatish Balay   }
25554e2b4712SSatish Balay 
25564e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25574e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2558d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2560dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
25614e2b4712SSatish Balay   PetscFunctionReturn(0);
25624e2b4712SSatish Balay }
2563f26ec98cSKris Buschelman 
2564f26ec98cSKris Buschelman #undef __FUNCT__
25658f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
25668f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
25678f690400SShri Abhyankar {
25688f690400SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
25698f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
25708f690400SShri Abhyankar   PetscErrorCode    ierr;
257129b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
25728f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
25738f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
25748f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
25758f690400SShri Abhyankar   const PetscScalar *b;
25768f690400SShri Abhyankar 
25778f690400SShri Abhyankar   PetscFunctionBegin;
25788f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25798f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25808f690400SShri Abhyankar   t  = a->solve_work;
25818f690400SShri Abhyankar 
25828f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
258329b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
25848f690400SShri Abhyankar 
25858f690400SShri Abhyankar   /* forward solve the lower triangular */
258629b92fc1SShri Abhyankar   idx    = 4*r[0];
25878f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
25888f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
25898f690400SShri Abhyankar   for (i=1; i<n; i++) {
25908f690400SShri Abhyankar     v     = aa + 16*ai[i];
25918f690400SShri Abhyankar     vi    = aj + ai[i];
25928f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
259329b92fc1SShri Abhyankar     idx   = 4*r[i];
25948f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
259529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
259629b92fc1SShri Abhyankar       idx   = 4*vi[m];
25978f690400SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
25988f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
25998f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
26008f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
26018f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
26028f690400SShri Abhyankar       v    += 16;
26038f690400SShri Abhyankar     }
26048f690400SShri Abhyankar     idx        = 4*i;
26058f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
26068f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
26078f690400SShri Abhyankar   }
26088f690400SShri Abhyankar   /* backward solve the upper triangular */
26098f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
26108f690400SShri Abhyankar     k    = 2*n-i;
26118f690400SShri Abhyankar     v    = aa + 16*ai[k];
26128f690400SShri Abhyankar     vi   = aj + ai[k];
26138f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
26148f690400SShri Abhyankar     idt  = 4*i;
26158f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
26168f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
261729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
261829b92fc1SShri Abhyankar       idx   = 4*vi[m];
26198f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
26208f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
26218f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
26228f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
26238f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
26248f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
26258f690400SShri Abhyankar       v += 16;
26268f690400SShri Abhyankar     }
262729b92fc1SShri Abhyankar     idc      = 4*c[i];
26288f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
26298f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
26308f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
26318f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
26328f690400SShri Abhyankar   }
26338f690400SShri Abhyankar 
26348f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26358f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
26368f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26378f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
26388f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
26398f690400SShri Abhyankar   PetscFunctionReturn(0);
26408f690400SShri Abhyankar }
26418f690400SShri Abhyankar 
26428f690400SShri Abhyankar #undef __FUNCT__
2643f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2644dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2645f26ec98cSKris Buschelman {
2646f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2647f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
26486849ba73SBarry Smith   PetscErrorCode    ierr;
26495d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
26505d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2651d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2652d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2653d9fead3dSBarry Smith   PetscScalar       *x;
2654d9fead3dSBarry Smith   const PetscScalar *b;
2655f26ec98cSKris Buschelman 
2656f26ec98cSKris Buschelman   PetscFunctionBegin;
2657d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26581ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2659f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2660f26ec98cSKris Buschelman 
2661f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2662f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2663f26ec98cSKris Buschelman 
2664f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2665f26ec98cSKris Buschelman   idx    = 4*(*r++);
2666f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2667f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2668f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2669f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2670f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2671f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2672f26ec98cSKris Buschelman     vi    = aj + ai[i];
2673f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2674f26ec98cSKris Buschelman     idx   = 4*(*r++);
2675f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2676f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2677f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2678f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2679f26ec98cSKris Buschelman     while (nz--) {
2680f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2681f26ec98cSKris Buschelman       x1  = t[idx];
2682f26ec98cSKris Buschelman       x2  = t[1+idx];
2683f26ec98cSKris Buschelman       x3  = t[2+idx];
2684f26ec98cSKris Buschelman       x4  = t[3+idx];
2685f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2686f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2687f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2688f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2689f26ec98cSKris Buschelman       v    += 16;
2690f26ec98cSKris Buschelman     }
2691f26ec98cSKris Buschelman     idx        = 4*i;
2692f26ec98cSKris Buschelman     t[idx]   = s1;
2693f26ec98cSKris Buschelman     t[1+idx] = s2;
2694f26ec98cSKris Buschelman     t[2+idx] = s3;
2695f26ec98cSKris Buschelman     t[3+idx] = s4;
2696f26ec98cSKris Buschelman   }
2697f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2698f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2699f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2700f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2701f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2702f26ec98cSKris Buschelman     idt  = 4*i;
2703f26ec98cSKris Buschelman     s1 = t[idt];
2704f26ec98cSKris Buschelman     s2 = t[1+idt];
2705f26ec98cSKris Buschelman     s3 = t[2+idt];
2706f26ec98cSKris Buschelman     s4 = t[3+idt];
2707f26ec98cSKris Buschelman     while (nz--) {
2708f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2709f26ec98cSKris Buschelman       x1  = t[idx];
2710f26ec98cSKris Buschelman       x2  = t[1+idx];
2711f26ec98cSKris Buschelman       x3  = t[2+idx];
2712f26ec98cSKris Buschelman       x4  = t[3+idx];
2713f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2714f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2715f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2716f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2717f26ec98cSKris Buschelman       v += 16;
2718f26ec98cSKris Buschelman     }
2719f26ec98cSKris Buschelman     idc      = 4*(*c--);
2720f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2721f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2722f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2723f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2724f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2725f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2726f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2727f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2728f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2729f26ec98cSKris Buschelman  }
2730f26ec98cSKris Buschelman 
2731f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2732f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2733d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27341ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2735dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2736f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2737f26ec98cSKris Buschelman }
2738f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - Permuted block-triangular solve (block
   size 4) whose 4x4 block products are carried out on packed floats through
   the SSE macro layer.

   a->row picks the block of bb used for each factor row; a->col picks the
   block of xx each solved row is streamed to. a->solve_work (t) holds the
   intermediate vector in factor ordering: the forward sweep writes it, the
   backward sweep reads and overwrites it, and xx is only written during the
   backward sweep.

   Throughout the loops XMM7 carries the running sum for the current block
   row and XMM6 holds the demoted operand block.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  /* 11 floats = two 4-float slots plus up to 3 floats of alignment slack */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /* round the scratch base up to the next 16-byte boundary (offset counted in floats) */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float.               */
        /* The forward results live in t (see the store at the end of   */
        /* this loop body); x is not written until the backward sweep,  */
        /* so the operand must be read from t, not from x.              */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      /* advance to the next block row here (the for statement has no increment) */
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      /* v -> diagonal block of the row just accumulated */
      v    = aa + ai16;
      /* Step to the previous block row. Look up its diagonal only when   */
      /* such a row exists: after the last row (i == 0) the index is -1   */
      /* and diag[-1] must not be read. ai16 then keeps its old value,    */
      /* which only feeds prefetches before the loop exits.               */
      if (--i >= 0) {
        ai16 = 16*diag[i];
        PREFETCH_NTA(aa+ai16+16);
      }
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
29610ef38995SBarry Smith 
29620ef38995SBarry Smith 
29634e2b4712SSatish Balay /*
29644e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
29654e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
29664e2b4712SSatish Balay */
29674a2ae208SSatish Balay #undef __FUNCT__
29684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2969dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
29704e2b4712SSatish Balay {
29714e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2972356650c2SBarry Smith   PetscInt          n=a->mbs;
2973356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2974dfbe8321SBarry Smith   PetscErrorCode    ierr;
2975356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2976d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2977d9fead3dSBarry Smith   PetscScalar       *x;
2978d9fead3dSBarry Smith   const PetscScalar *b;
29794e2b4712SSatish Balay 
29804e2b4712SSatish Balay   PetscFunctionBegin;
2981d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
29834e2b4712SSatish Balay 
2984aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
29852853dc0eSBarry Smith   {
298687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
29872853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
29882853dc0eSBarry Smith   }
2989aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
29902853dc0eSBarry Smith   {
299187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
29922853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
29932853dc0eSBarry Smith   }
2994aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
29952853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2996e1293385SBarry Smith #else
299730d4dcafSBarry Smith   {
299887828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2999d9fead3dSBarry Smith     const MatScalar *v;
3000356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3001356650c2SBarry Smith     const PetscInt  *vi;
3002e1293385SBarry Smith 
30034e2b4712SSatish Balay   /* forward solve the lower triangular */
30044e2b4712SSatish Balay   idx    = 0;
3005e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
30064e2b4712SSatish Balay   for (i=1; i<n; i++) {
30074e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
30084e2b4712SSatish Balay     vi    =  aj      + ai[i];
30094e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3010e1293385SBarry Smith     idx   +=  4;
3011f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30124e2b4712SSatish Balay     while (nz--) {
30134e2b4712SSatish Balay       jdx   = 4*(*vi++);
30144e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3015f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3016f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3017f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3018f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
30194e2b4712SSatish Balay       v    += 16;
30204e2b4712SSatish Balay     }
3021f1af5d2fSBarry Smith     x[idx]   = s1;
3022f1af5d2fSBarry Smith     x[1+idx] = s2;
3023f1af5d2fSBarry Smith     x[2+idx] = s3;
3024f1af5d2fSBarry Smith     x[3+idx] = s4;
30254e2b4712SSatish Balay   }
30264e2b4712SSatish Balay   /* backward solve the upper triangular */
30274e555682SBarry Smith   idt = 4*(n-1);
30284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30294e555682SBarry Smith     ai16 = 16*diag[i];
30304e555682SBarry Smith     v    = aa + ai16 + 16;
30314e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30324e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3033f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3034f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
30354e2b4712SSatish Balay     while (nz--) {
30364e2b4712SSatish Balay       idx   = 4*(*vi++);
30374e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3038f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3039f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3040f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3041f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
30424e2b4712SSatish Balay       v    += 16;
30434e2b4712SSatish Balay     }
30444e555682SBarry Smith     v        = aa + ai16;
3045f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3046f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3047f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3048f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3049329f5518SBarry Smith     idt -= 4;
30504e2b4712SSatish Balay   }
305130d4dcafSBarry Smith   }
3052e1293385SBarry Smith #endif
30534e2b4712SSatish Balay 
3054d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3056dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
30574e2b4712SSatish Balay   PetscFunctionReturn(0);
30584e2b4712SSatish Balay }
30594e2b4712SSatish Balay 
3060f26ec98cSKris Buschelman #undef __FUNCT__
3061cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3062cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3063cee9d6f2SShri Abhyankar {
3064cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
30656464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3066cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3067cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3068cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3069cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3070cee9d6f2SShri Abhyankar     PetscScalar       *x;
3071cee9d6f2SShri Abhyankar     const PetscScalar *b;
3072cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3073cee9d6f2SShri Abhyankar 
3074cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3075cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3076cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3077cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3078cee9d6f2SShri Abhyankar     idx    = 0;
3079cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3080cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3081cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3082cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3083cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3084cee9d6f2SShri Abhyankar       idx   = bs*i;
3085cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30866464896eSShri Abhyankar       for(k=0;k<nz;k++) {
30876464896eSShri Abhyankar           jdx   = bs*vi[k];
3088cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3089cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3090cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3091cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3092cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3093cee9d6f2SShri Abhyankar 
3094cee9d6f2SShri Abhyankar           v   +=  bs2;
3095cee9d6f2SShri Abhyankar         }
3096cee9d6f2SShri Abhyankar 
3097cee9d6f2SShri Abhyankar        x[idx]   = s1;
3098cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3099cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3100cee9d6f2SShri Abhyankar        x[3+idx] = s4;
3101cee9d6f2SShri Abhyankar     }
3102cee9d6f2SShri Abhyankar 
3103cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3104cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3105cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3106cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3107cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3108cee9d6f2SShri Abhyankar      idt = bs*i;
3109cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3110cee9d6f2SShri Abhyankar 
31116464896eSShri Abhyankar     for(k=0;k<nz;k++){
31126464896eSShri Abhyankar       idx   = bs*vi[k];
3113cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3114cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3115cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3116cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3117cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3118cee9d6f2SShri Abhyankar 
3119cee9d6f2SShri Abhyankar         v   +=  bs2;
3120cee9d6f2SShri Abhyankar     }
3121cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3122cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3123cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3124cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3125cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3126cee9d6f2SShri Abhyankar 
3127cee9d6f2SShri Abhyankar   }
3128cee9d6f2SShri Abhyankar 
3129cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3130cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3131cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3132cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3133cee9d6f2SShri Abhyankar }
3134cee9d6f2SShri Abhyankar 
3135b2b2dd24SShri Abhyankar #undef __FUNCT__
3136b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3137b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3138b2b2dd24SShri Abhyankar {
3139b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3140b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3141b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3142b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3143b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3144b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3145b2b2dd24SShri Abhyankar     PetscScalar       *x;
3146b2b2dd24SShri Abhyankar     const PetscScalar *b;
3147b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3148cee9d6f2SShri Abhyankar 
3149b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3150b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3151b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3152b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3153b2b2dd24SShri Abhyankar     idx    = 0;
3154b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3155b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3156b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3157b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3158b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3159b2b2dd24SShri Abhyankar       idx   = bs*i;
3160b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3161b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3162b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3163b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3164b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3165b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3166b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3167b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3168b2b2dd24SShri Abhyankar 
3169b2b2dd24SShri Abhyankar           v   +=  bs2;
3170b2b2dd24SShri Abhyankar         }
3171b2b2dd24SShri Abhyankar 
3172b2b2dd24SShri Abhyankar        x[idx]   = s1;
3173b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3174b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3175b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3176b2b2dd24SShri Abhyankar     }
3177b2b2dd24SShri Abhyankar 
3178b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3179b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3180b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3181b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3182b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3183b2b2dd24SShri Abhyankar      idt = bs*i;
3184b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3185b2b2dd24SShri Abhyankar 
3186b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3187b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3188b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3189b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3190b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3191b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3192b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3193b2b2dd24SShri Abhyankar 
3194b2b2dd24SShri Abhyankar         v   +=  bs2;
3195b2b2dd24SShri Abhyankar     }
3196b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3197b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3198b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3199b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3200b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3201b2b2dd24SShri Abhyankar 
3202b2b2dd24SShri Abhyankar   }
3203b2b2dd24SShri Abhyankar 
3204b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3205b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3206b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3207b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3208b2b2dd24SShri Abhyankar }
3209cee9d6f2SShri Abhyankar 
3210cee9d6f2SShri Abhyankar #undef __FUNCT__
3211f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3212dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3213f26ec98cSKris Buschelman {
3214f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3215690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3216dfbe8321SBarry Smith   PetscErrorCode ierr;
3217690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3218f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3219f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3220f26ec98cSKris Buschelman 
3221f26ec98cSKris Buschelman   PetscFunctionBegin;
32221ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
32231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3224f26ec98cSKris Buschelman 
3225f26ec98cSKris Buschelman   {
3226f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3227f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3228690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3229f26ec98cSKris Buschelman 
3230f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3231f26ec98cSKris Buschelman     idx  = 0;
3232f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3233f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3234f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3235f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3236f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3237f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3238f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3239f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3240f26ec98cSKris Buschelman       idx   +=  4;
3241f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3242f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3243f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3244f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3245f26ec98cSKris Buschelman       while (nz--) {
3246f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3247f26ec98cSKris Buschelman         x1  = t[jdx];
3248f26ec98cSKris Buschelman         x2  = t[1+jdx];
3249f26ec98cSKris Buschelman         x3  = t[2+jdx];
3250f26ec98cSKris Buschelman         x4  = t[3+jdx];
3251f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3252f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3253f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3254f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3255f26ec98cSKris Buschelman         v    += 16;
3256f26ec98cSKris Buschelman       }
3257f26ec98cSKris Buschelman       t[idx]   = s1;
3258f26ec98cSKris Buschelman       t[1+idx] = s2;
3259f26ec98cSKris Buschelman       t[2+idx] = s3;
3260f26ec98cSKris Buschelman       t[3+idx] = s4;
3261f26ec98cSKris Buschelman     }
3262f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3263f26ec98cSKris Buschelman     idt = 4*(n-1);
3264f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3265f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3266f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3267f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3268f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3269f26ec98cSKris Buschelman       s1   = t[idt];
3270f26ec98cSKris Buschelman       s2   = t[1+idt];
3271f26ec98cSKris Buschelman       s3   = t[2+idt];
3272f26ec98cSKris Buschelman       s4   = t[3+idt];
3273f26ec98cSKris Buschelman       while (nz--) {
3274f26ec98cSKris Buschelman         idx = 4*(*vi++);
3275f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3276f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3277f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3278f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3279f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3280f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3281f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3282f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3283f26ec98cSKris Buschelman         v    += 16;
3284f26ec98cSKris Buschelman       }
3285f26ec98cSKris Buschelman       v        = aa + ai16;
3286f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3287f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3288f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3289f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3290f26ec98cSKris Buschelman       idt -= 4;
3291f26ec98cSKris Buschelman     }
3292f26ec98cSKris Buschelman   }
3293f26ec98cSKris Buschelman 
32941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3296dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3297f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3298f26ec98cSKris Buschelman }
3299f26ec98cSKris Buschelman 
33003660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
33013660e330SKris Buschelman 
33023660e330SKris Buschelman #include PETSC_HAVE_SSE
33033660e330SKris Buschelman #undef __FUNCT__
33047cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3305dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
33063660e330SKris Buschelman {
33073660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
33082aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3309dfbe8321SBarry Smith   PetscErrorCode ierr;
3310dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
33113660e330SKris Buschelman   MatScalar      *aa=a->a;
331287828ca2SBarry Smith   PetscScalar    *x,*b;
33133660e330SKris Buschelman 
33143660e330SKris Buschelman   PetscFunctionBegin;
33153660e330SKris Buschelman   SSE_SCOPE_BEGIN;
33163660e330SKris Buschelman   /*
33173660e330SKris Buschelman      Note: This code currently uses demotion of double
33183660e330SKris Buschelman      to float when performing the mixed-mode computation.
33193660e330SKris Buschelman      This may not be numerically reasonable for all applications.
33203660e330SKris Buschelman   */
33213660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
33223660e330SKris Buschelman 
33231ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
33241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
33253660e330SKris Buschelman   {
3326eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3327eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
33282aa5897fSKris Buschelman     int            nz,i,idt,ai16;
33292aa5897fSKris Buschelman     unsigned int   jdx,idx;
33302aa5897fSKris Buschelman     unsigned short *vi;
3331eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
33323660e330SKris Buschelman 
3333eb05f457SKris Buschelman     /* First block is the identity. */
33343660e330SKris Buschelman     idx  = 0;
3335eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
33362aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
33373660e330SKris Buschelman 
33383660e330SKris Buschelman     for (i=1; i<n;) {
33393660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
33403660e330SKris Buschelman       vi   =  aj      + ai[i];
33413660e330SKris Buschelman       nz   =  diag[i] - ai[i];
33423660e330SKris Buschelman       idx +=  4;
33433660e330SKris Buschelman 
3344eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3345eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3346eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
33473660e330SKris Buschelman 
33483660e330SKris Buschelman       while (nz--) {
33493660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
33502aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
33513660e330SKris Buschelman 
33523660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3353eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
33543660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
33553660e330SKris Buschelman 
33563660e330SKris Buschelman           /* First Column */
33573660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
33583660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
33593660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
33603660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
33613660e330SKris Buschelman 
33623660e330SKris Buschelman           /* Second Column */
33633660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
33643660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
33653660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
33663660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
33673660e330SKris Buschelman 
33683660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
33693660e330SKris Buschelman 
33703660e330SKris Buschelman           /* Third Column */
33713660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
33723660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
33733660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
33743660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
33753660e330SKris Buschelman 
33763660e330SKris Buschelman           /* Fourth Column */
33773660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
33783660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
33793660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
33803660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
33813660e330SKris Buschelman         SSE_INLINE_END_2
33823660e330SKris Buschelman 
33833660e330SKris Buschelman         v  += 16;
33843660e330SKris Buschelman       }
33853660e330SKris Buschelman       v    =  aa + 16*ai[++i];
33863660e330SKris Buschelman       PREFETCH_NTA(v);
3387eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
33883660e330SKris Buschelman     }
3389eb05f457SKris Buschelman 
3390eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3391eb05f457SKris Buschelman 
33923660e330SKris Buschelman     idt  = 4*(n-1);
33933660e330SKris Buschelman     ai16 = 16*diag[n-1];
33943660e330SKris Buschelman     v    = aa + ai16 + 16;
33953660e330SKris Buschelman     for (i=n-1; i>=0;){
33963660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
33973660e330SKris Buschelman       vi = aj + diag[i] + 1;
33983660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
33993660e330SKris Buschelman 
3400eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
34013660e330SKris Buschelman 
34023660e330SKris Buschelman       while (nz--) {
34033660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
34042aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
34053660e330SKris Buschelman 
34063660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3407eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
34083660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
34093660e330SKris Buschelman 
34103660e330SKris Buschelman           /* First Column */
34113660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
34123660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
34133660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
34143660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
34153660e330SKris Buschelman 
34163660e330SKris Buschelman           /* Second Column */
34173660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
34183660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
34193660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
34203660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
34213660e330SKris Buschelman 
34223660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
34233660e330SKris Buschelman 
34243660e330SKris Buschelman           /* Third Column */
34253660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
34263660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
34273660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
34283660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
34293660e330SKris Buschelman 
34303660e330SKris Buschelman           /* Fourth Column */
34313660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
34323660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
34333660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
34343660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
34353660e330SKris Buschelman         SSE_INLINE_END_2
34363660e330SKris Buschelman         v  += 16;
34373660e330SKris Buschelman       }
34383660e330SKris Buschelman       v    = aa + ai16;
34393660e330SKris Buschelman       ai16 = 16*diag[--i];
34403660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
34413660e330SKris Buschelman       /*
34423660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
34433660e330SKris Buschelman          which was inverted as part of the factorization
34443660e330SKris Buschelman       */
3445eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
34463660e330SKris Buschelman         /* First Column */
34473660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
34483660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
34493660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
34503660e330SKris Buschelman 
34513660e330SKris Buschelman         /* Second Column */
34523660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
34533660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
34543660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
34553660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
34563660e330SKris Buschelman 
34573660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
34583660e330SKris Buschelman 
34593660e330SKris Buschelman         /* Third Column */
34603660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
34613660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
34623660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
34633660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
34643660e330SKris Buschelman 
34653660e330SKris Buschelman         /* Fourth Column */
34663660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
34673660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
34683660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
34693660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
34703660e330SKris Buschelman 
34713660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
34723660e330SKris Buschelman       SSE_INLINE_END_3
34733660e330SKris Buschelman 
34743660e330SKris Buschelman       v    = aa + ai16 + 16;
34753660e330SKris Buschelman       idt -= 4;
34763660e330SKris Buschelman     }
3477eb05f457SKris Buschelman 
3478eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3479eb05f457SKris Buschelman     idt = 4*(n-1);
3480eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3481eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3482eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3483eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3484eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3485eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3486eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3487eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3488eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
348954693613SKris Buschelman       idt -= 4;
34903660e330SKris Buschelman     }
3491eb05f457SKris Buschelman 
3492eb05f457SKris Buschelman   } /* End of artificial scope. */
34931ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3495dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
34963660e330SKris Buschelman   SSE_SCOPE_END;
34973660e330SKris Buschelman   PetscFunctionReturn(0);
34983660e330SKris Buschelman }
34993660e330SKris Buschelman 
35007cf1b8d3SKris Buschelman #undef __FUNCT__
35017cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3502dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
35037cf1b8d3SKris Buschelman {
35047cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
35057cf1b8d3SKris Buschelman   int            *aj=a->j;
3506dfbe8321SBarry Smith   PetscErrorCode ierr;
3507dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
35087cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
35097cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
35107cf1b8d3SKris Buschelman 
35117cf1b8d3SKris Buschelman   PetscFunctionBegin;
35127cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
35137cf1b8d3SKris Buschelman   /*
35147cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
35157cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
35167cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
35177cf1b8d3SKris Buschelman   */
35187cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
35197cf1b8d3SKris Buschelman 
35201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
35211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
35227cf1b8d3SKris Buschelman   {
35237cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
35247cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
35257cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
35267cf1b8d3SKris Buschelman     int       jdx,idx;
35277cf1b8d3SKris Buschelman     int       *vi;
35287cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
35297cf1b8d3SKris Buschelman 
35307cf1b8d3SKris Buschelman     /* First block is the identity. */
35317cf1b8d3SKris Buschelman     idx  = 0;
35327cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
35337cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
35347cf1b8d3SKris Buschelman 
35357cf1b8d3SKris Buschelman     for (i=1; i<n;) {
35367cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
35377cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
35387cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
35397cf1b8d3SKris Buschelman       idx +=  4;
35407cf1b8d3SKris Buschelman 
35417cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
35427cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
35437cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
35447cf1b8d3SKris Buschelman 
35457cf1b8d3SKris Buschelman       while (nz--) {
35467cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
35477cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
35487cf1b8d3SKris Buschelman /*          jdx = *vi++; */
35497cf1b8d3SKris Buschelman 
35507cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
35517cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
35527cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
35537cf1b8d3SKris Buschelman 
35547cf1b8d3SKris Buschelman           /* First Column */
35557cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
35567cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
35577cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
35587cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
35597cf1b8d3SKris Buschelman 
35607cf1b8d3SKris Buschelman           /* Second Column */
35617cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
35627cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
35637cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
35647cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
35657cf1b8d3SKris Buschelman 
35667cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
35677cf1b8d3SKris Buschelman 
35687cf1b8d3SKris Buschelman           /* Third Column */
35697cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
35707cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
35717cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
35727cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
35737cf1b8d3SKris Buschelman 
35747cf1b8d3SKris Buschelman           /* Fourth Column */
35757cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
35767cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
35777cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
35787cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
35797cf1b8d3SKris Buschelman         SSE_INLINE_END_2
35807cf1b8d3SKris Buschelman 
35817cf1b8d3SKris Buschelman         v  += 16;
35827cf1b8d3SKris Buschelman       }
35837cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
35847cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
35857cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
35867cf1b8d3SKris Buschelman     }
35877cf1b8d3SKris Buschelman 
35887cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
35897cf1b8d3SKris Buschelman 
35907cf1b8d3SKris Buschelman     idt  = 4*(n-1);
35917cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
35927cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
35937cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
35947cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
35957cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
35967cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
35977cf1b8d3SKris Buschelman 
35987cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
35997cf1b8d3SKris Buschelman 
36007cf1b8d3SKris Buschelman       while (nz--) {
36017cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
36027cf1b8d3SKris Buschelman         idx = 4*(*vi++);
36037cf1b8d3SKris Buschelman /*          idx = *vi++; */
36047cf1b8d3SKris Buschelman 
36057cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
36067cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
36077cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
36087cf1b8d3SKris Buschelman 
36097cf1b8d3SKris Buschelman           /* First Column */
36107cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
36117cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
36127cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
36137cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
36147cf1b8d3SKris Buschelman 
36157cf1b8d3SKris Buschelman           /* Second Column */
36167cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
36177cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
36187cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
36197cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
36207cf1b8d3SKris Buschelman 
36217cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
36227cf1b8d3SKris Buschelman 
36237cf1b8d3SKris Buschelman           /* Third Column */
36247cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
36257cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
36267cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
36277cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
36287cf1b8d3SKris Buschelman 
36297cf1b8d3SKris Buschelman           /* Fourth Column */
36307cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
36317cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
36327cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
36337cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36347cf1b8d3SKris Buschelman         SSE_INLINE_END_2
36357cf1b8d3SKris Buschelman         v  += 16;
36367cf1b8d3SKris Buschelman       }
36377cf1b8d3SKris Buschelman       v    = aa + ai16;
36387cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
36397cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
36407cf1b8d3SKris Buschelman       /*
36417cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
36427cf1b8d3SKris Buschelman          which was inverted as part of the factorization
36437cf1b8d3SKris Buschelman       */
36447cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
36457cf1b8d3SKris Buschelman         /* First Column */
36467cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
36477cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
36487cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
36497cf1b8d3SKris Buschelman 
36507cf1b8d3SKris Buschelman         /* Second Column */
36517cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
36527cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
36537cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
36547cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
36557cf1b8d3SKris Buschelman 
36567cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
36577cf1b8d3SKris Buschelman 
36587cf1b8d3SKris Buschelman         /* Third Column */
36597cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
36607cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
36617cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
36627cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
36637cf1b8d3SKris Buschelman 
36647cf1b8d3SKris Buschelman         /* Fourth Column */
36657cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
36667cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
36677cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
36687cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
36697cf1b8d3SKris Buschelman 
36707cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
36717cf1b8d3SKris Buschelman       SSE_INLINE_END_3
36727cf1b8d3SKris Buschelman 
36737cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
36747cf1b8d3SKris Buschelman       idt -= 4;
36757cf1b8d3SKris Buschelman     }
36767cf1b8d3SKris Buschelman 
36777cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
36787cf1b8d3SKris Buschelman     idt = 4*(n-1);
36797cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
36807cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
36817cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
36827cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
36837cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
36847cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
36857cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
36867cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
36877cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
36887cf1b8d3SKris Buschelman       idt -= 4;
36897cf1b8d3SKris Buschelman     }
36907cf1b8d3SKris Buschelman 
36917cf1b8d3SKris Buschelman   } /* End of artificial scope. */
36921ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
36931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3694dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
36957cf1b8d3SKris Buschelman   SSE_SCOPE_END;
36967cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
36977cf1b8d3SKris Buschelman }
36987cf1b8d3SKris Buschelman 
36993660e330SKris Buschelman #endif
37008f690400SShri Abhyankar 
37014a2ae208SSatish Balay #undef __FUNCT__
37024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3703dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
37044e2b4712SSatish Balay {
37054e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
37064e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
37076849ba73SBarry Smith   PetscErrorCode    ierr;
37085d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
37095d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3710d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3711d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3712d9fead3dSBarry Smith   const PetscScalar *b;
37134e2b4712SSatish Balay 
37144e2b4712SSatish Balay   PetscFunctionBegin;
3715d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
37161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3717f1af5d2fSBarry Smith   t  = a->solve_work;
37184e2b4712SSatish Balay 
37194e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
37204e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
37214e2b4712SSatish Balay 
37224e2b4712SSatish Balay   /* forward solve the lower triangular */
37234e2b4712SSatish Balay   idx    = 3*(*r++);
3724f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
37254e2b4712SSatish Balay   for (i=1; i<n; i++) {
37264e2b4712SSatish Balay     v     = aa + 9*ai[i];
37274e2b4712SSatish Balay     vi    = aj + ai[i];
37284e2b4712SSatish Balay     nz    = diag[i] - ai[i];
37294e2b4712SSatish Balay     idx   = 3*(*r++);
3730f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
37314e2b4712SSatish Balay     while (nz--) {
37324e2b4712SSatish Balay       idx   = 3*(*vi++);
3733f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3734f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3735f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3736f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
37374e2b4712SSatish Balay       v += 9;
37384e2b4712SSatish Balay     }
37394e2b4712SSatish Balay     idx = 3*i;
3740f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
37414e2b4712SSatish Balay   }
37424e2b4712SSatish Balay   /* backward solve the upper triangular */
37434e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
37444e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
37454e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
37464e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
37474e2b4712SSatish Balay     idt  = 3*i;
3748f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
37494e2b4712SSatish Balay     while (nz--) {
37504e2b4712SSatish Balay       idx   = 3*(*vi++);
3751f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3752f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3753f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3754f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
37554e2b4712SSatish Balay       v += 9;
37564e2b4712SSatish Balay     }
37574e2b4712SSatish Balay     idc = 3*(*c--);
37584e2b4712SSatish Balay     v   = aa + 9*diag[i];
3759f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3760f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3761f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
37624e2b4712SSatish Balay   }
37634e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
37644e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3765d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
37661ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3767dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
37684e2b4712SSatish Balay   PetscFunctionReturn(0);
37694e2b4712SSatish Balay }
37704e2b4712SSatish Balay 
37718f690400SShri Abhyankar #undef __FUNCT__
37728f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
37738f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
37748f690400SShri Abhyankar {
37758f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
37768f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
37778f690400SShri Abhyankar   PetscErrorCode    ierr;
377829b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
37798f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
37808f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
37818f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
37828f690400SShri Abhyankar   const PetscScalar *b;
37838f690400SShri Abhyankar 
37848f690400SShri Abhyankar   PetscFunctionBegin;
37858f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
37868f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
37878f690400SShri Abhyankar   t  = a->solve_work;
37888f690400SShri Abhyankar 
37898f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
379029b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
37918f690400SShri Abhyankar 
37928f690400SShri Abhyankar   /* forward solve the lower triangular */
379329b92fc1SShri Abhyankar   idx    = 3*r[0];
37948f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
37958f690400SShri Abhyankar   for (i=1; i<n; i++) {
37968f690400SShri Abhyankar     v     = aa + 9*ai[i];
37978f690400SShri Abhyankar     vi    = aj + ai[i];
37988f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
379929b92fc1SShri Abhyankar     idx   = 3*r[i];
38008f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
380129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
380229b92fc1SShri Abhyankar       idx   = 3*vi[m];
38038f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
38048f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
38058f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
38068f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
38078f690400SShri Abhyankar       v += 9;
38088f690400SShri Abhyankar     }
38098f690400SShri Abhyankar     idx = 3*i;
38108f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
38118f690400SShri Abhyankar   }
38128f690400SShri Abhyankar   /* backward solve the upper triangular */
38138f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
38148f690400SShri Abhyankar     k    = 2*n-i;
38158f690400SShri Abhyankar     v    = aa + 9*ai[k];
38168f690400SShri Abhyankar     vi   = aj + ai[k];
38178f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
38188f690400SShri Abhyankar     idt  = 3*i;
38198f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
382029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
382129b92fc1SShri Abhyankar       idx   = 3*vi[m];
38228f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
38238f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
38248f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
38258f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
38268f690400SShri Abhyankar       v += 9;
38278f690400SShri Abhyankar     }
382829b92fc1SShri Abhyankar     idc = 3*c[i];
38298f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
38308f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
38318f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
38328f690400SShri Abhyankar   }
38338f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
38348f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
38358f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38368f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
38378f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
38388f690400SShri Abhyankar   PetscFunctionReturn(0);
38398f690400SShri Abhyankar }
38408f690400SShri Abhyankar 
3841*0c4413a7SShri Abhyankar #undef __FUNCT__
3842*0c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
3843*0c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3844*0c4413a7SShri Abhyankar {
3845*0c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3846*0c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
3847*0c4413a7SShri Abhyankar   PetscErrorCode    ierr;
3848*0c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
3849*0c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3850*0c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
3851*0c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3852*0c4413a7SShri Abhyankar   const PetscScalar *b;
3853*0c4413a7SShri Abhyankar 
3854*0c4413a7SShri Abhyankar   PetscFunctionBegin;
3855*0c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3856*0c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3857*0c4413a7SShri Abhyankar   t  = a->solve_work;
3858*0c4413a7SShri Abhyankar 
3859*0c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3860*0c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
3861*0c4413a7SShri Abhyankar 
3862*0c4413a7SShri Abhyankar   /* forward solve the lower triangular */
3863*0c4413a7SShri Abhyankar   idx    = 3*r[0];
3864*0c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
3865*0c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
3866*0c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
3867*0c4413a7SShri Abhyankar     vi    = aj + ai[i];
3868*0c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
3869*0c4413a7SShri Abhyankar     idx   = 3*r[i];
3870*0c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
3871*0c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
3872*0c4413a7SShri Abhyankar       idx   = 3*vi[m];
3873*0c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3874*0c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3875*0c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3876*0c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3877*0c4413a7SShri Abhyankar       v += 9;
3878*0c4413a7SShri Abhyankar     }
3879*0c4413a7SShri Abhyankar     idx = 3*i;
3880*0c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
3881*0c4413a7SShri Abhyankar   }
3882*0c4413a7SShri Abhyankar   /* backward solve the upper triangular */
3883*0c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
3884*0c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
3885*0c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
3886*0c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
3887*0c4413a7SShri Abhyankar     idt  = 3*i;
3888*0c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
3889*0c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
3890*0c4413a7SShri Abhyankar       idx   = 3*vi[m];
3891*0c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3892*0c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3893*0c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3894*0c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3895*0c4413a7SShri Abhyankar       v += 9;
3896*0c4413a7SShri Abhyankar     }
3897*0c4413a7SShri Abhyankar     idc = 3*c[i];
3898*0c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3899*0c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3900*0c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3901*0c4413a7SShri Abhyankar   }
3902*0c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3903*0c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3904*0c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3905*0c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3906*0c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
3907*0c4413a7SShri Abhyankar   PetscFunctionReturn(0);
3908*0c4413a7SShri Abhyankar }
3909*0c4413a7SShri Abhyankar 
391015091d37SBarry Smith /*
391115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
391215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
391315091d37SBarry Smith */
39144a2ae208SSatish Balay #undef __FUNCT__
39154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3916dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
391715091d37SBarry Smith {
391815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3919690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3920dfbe8321SBarry Smith   PetscErrorCode    ierr;
3921690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3922d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3923d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3924d9fead3dSBarry Smith   const PetscScalar *b;
3925690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
392615091d37SBarry Smith 
392715091d37SBarry Smith   PetscFunctionBegin;
3928d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39291ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393015091d37SBarry Smith 
393115091d37SBarry Smith   /* forward solve the lower triangular */
393215091d37SBarry Smith   idx    = 0;
393315091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
393415091d37SBarry Smith   for (i=1; i<n; i++) {
393515091d37SBarry Smith     v     =  aa      + 9*ai[i];
393615091d37SBarry Smith     vi    =  aj      + ai[i];
393715091d37SBarry Smith     nz    =  diag[i] - ai[i];
393815091d37SBarry Smith     idx   +=  3;
3939f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
394015091d37SBarry Smith     while (nz--) {
394115091d37SBarry Smith       jdx   = 3*(*vi++);
394215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3943f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3944f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3945f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
394615091d37SBarry Smith       v    += 9;
394715091d37SBarry Smith     }
3948f1af5d2fSBarry Smith     x[idx]   = s1;
3949f1af5d2fSBarry Smith     x[1+idx] = s2;
3950f1af5d2fSBarry Smith     x[2+idx] = s3;
395115091d37SBarry Smith   }
395215091d37SBarry Smith   /* backward solve the upper triangular */
395315091d37SBarry Smith   for (i=n-1; i>=0; i--){
395415091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
395515091d37SBarry Smith     vi   = aj + diag[i] + 1;
395615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
395715091d37SBarry Smith     idt  = 3*i;
3958f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3959f1af5d2fSBarry Smith     s3 = x[2+idt];
396015091d37SBarry Smith     while (nz--) {
396115091d37SBarry Smith       idx   = 3*(*vi++);
396215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3963f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3964f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3965f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
396615091d37SBarry Smith       v    += 9;
396715091d37SBarry Smith     }
396815091d37SBarry Smith     v        = aa +  9*diag[i];
3969f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3970f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3971f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
397215091d37SBarry Smith   }
397315091d37SBarry Smith 
3974d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3976dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
397715091d37SBarry Smith   PetscFunctionReturn(0);
397815091d37SBarry Smith }
397915091d37SBarry Smith 
39804a2ae208SSatish Balay #undef __FUNCT__
3981cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3982cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3983cee9d6f2SShri Abhyankar {
3984cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3985ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3986cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3987cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3988cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3989cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3990cee9d6f2SShri Abhyankar     PetscScalar       *x;
3991cee9d6f2SShri Abhyankar     const PetscScalar *b;
3992cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3993cee9d6f2SShri Abhyankar 
3994cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3995cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3996cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3997cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3998cee9d6f2SShri Abhyankar     idx    = 0;
3999cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4000cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4001cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
4002cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4003cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4004cee9d6f2SShri Abhyankar       idx   = bs*i;
4005cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4006ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4007ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
4008cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4009cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4010cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4011cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4012cee9d6f2SShri Abhyankar 
4013cee9d6f2SShri Abhyankar           v   +=  bs2;
4014cee9d6f2SShri Abhyankar         }
4015cee9d6f2SShri Abhyankar 
4016cee9d6f2SShri Abhyankar        x[idx]   = s1;
4017cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4018cee9d6f2SShri Abhyankar        x[2+idx] = s3;
4019cee9d6f2SShri Abhyankar     }
4020cee9d6f2SShri Abhyankar 
4021cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4022cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4023cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
4024cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4025cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4026cee9d6f2SShri Abhyankar      idt = bs*i;
4027cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4028cee9d6f2SShri Abhyankar 
4029ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4030ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
4031cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4032cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4033cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4034cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4035cee9d6f2SShri Abhyankar 
4036cee9d6f2SShri Abhyankar         v   +=  bs2;
4037cee9d6f2SShri Abhyankar     }
4038cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4039cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4040cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4041cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4042cee9d6f2SShri Abhyankar 
4043cee9d6f2SShri Abhyankar   }
4044cee9d6f2SShri Abhyankar 
4045cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4046cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4047cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4048cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4049cee9d6f2SShri Abhyankar }
4050cee9d6f2SShri Abhyankar 
4051cee9d6f2SShri Abhyankar #undef __FUNCT__
4052b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4053b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4054b2b2dd24SShri Abhyankar {
4055b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4056b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4057b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4058b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4059b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4060b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4061b2b2dd24SShri Abhyankar     PetscScalar       *x;
4062b2b2dd24SShri Abhyankar     const PetscScalar *b;
4063b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4064b2b2dd24SShri Abhyankar 
4065b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4066b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4067b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4068b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4069b2b2dd24SShri Abhyankar     idx    = 0;
4070b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4071b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4072b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4073b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4074b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4075b2b2dd24SShri Abhyankar       idx   = bs*i;
4076b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4077b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4078b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4079b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4080b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4081b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4082b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4083b2b2dd24SShri Abhyankar 
4084b2b2dd24SShri Abhyankar           v   +=  bs2;
4085b2b2dd24SShri Abhyankar         }
4086b2b2dd24SShri Abhyankar 
4087b2b2dd24SShri Abhyankar        x[idx]   = s1;
4088b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4089b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4090b2b2dd24SShri Abhyankar     }
4091b2b2dd24SShri Abhyankar 
4092b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4093b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4094b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4095b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4096b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4097b2b2dd24SShri Abhyankar      idt = bs*i;
4098b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4099b2b2dd24SShri Abhyankar 
4100b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4101b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4102b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4103b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4104b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4105b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4106b2b2dd24SShri Abhyankar 
4107b2b2dd24SShri Abhyankar         v   +=  bs2;
4108b2b2dd24SShri Abhyankar     }
4109b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4110b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4111b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4112b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4113b2b2dd24SShri Abhyankar 
4114b2b2dd24SShri Abhyankar   }
4115b2b2dd24SShri Abhyankar 
4116b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4117b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4118b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4119b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4120b2b2dd24SShri Abhyankar }
4121b2b2dd24SShri Abhyankar 
4122b2b2dd24SShri Abhyankar #undef __FUNCT__
41234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4124dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
41254e2b4712SSatish Balay {
41264e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
41274e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
41286849ba73SBarry Smith   PetscErrorCode    ierr;
41295d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
41305d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4131d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4132d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4133d9fead3dSBarry Smith   const PetscScalar *b;
41344e2b4712SSatish Balay 
41354e2b4712SSatish Balay   PetscFunctionBegin;
4136d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4138f1af5d2fSBarry Smith   t  = a->solve_work;
41394e2b4712SSatish Balay 
41404e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
41414e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
41424e2b4712SSatish Balay 
41434e2b4712SSatish Balay   /* forward solve the lower triangular */
41444e2b4712SSatish Balay   idx    = 2*(*r++);
4145f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
41464e2b4712SSatish Balay   for (i=1; i<n; i++) {
41474e2b4712SSatish Balay     v     = aa + 4*ai[i];
41484e2b4712SSatish Balay     vi    = aj + ai[i];
41494e2b4712SSatish Balay     nz    = diag[i] - ai[i];
41504e2b4712SSatish Balay     idx   = 2*(*r++);
4151f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
41524e2b4712SSatish Balay     while (nz--) {
41534e2b4712SSatish Balay       idx   = 2*(*vi++);
4154f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4155f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4156f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
41574e2b4712SSatish Balay       v += 4;
41584e2b4712SSatish Balay     }
41594e2b4712SSatish Balay     idx = 2*i;
4160f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
41614e2b4712SSatish Balay   }
41624e2b4712SSatish Balay   /* backward solve the upper triangular */
41634e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
41644e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
41654e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41664e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
41674e2b4712SSatish Balay     idt  = 2*i;
4168f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
41694e2b4712SSatish Balay     while (nz--) {
41704e2b4712SSatish Balay       idx   = 2*(*vi++);
4171f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4172f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4173f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
41744e2b4712SSatish Balay       v += 4;
41754e2b4712SSatish Balay     }
41764e2b4712SSatish Balay     idc = 2*(*c--);
41774e2b4712SSatish Balay     v   = aa + 4*diag[i];
4178f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4179f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
41804e2b4712SSatish Balay   }
41814e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
41824e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4183d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41841ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4185dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
41864e2b4712SSatish Balay   PetscFunctionReturn(0);
41874e2b4712SSatish Balay }
41884e2b4712SSatish Balay 
41898f690400SShri Abhyankar #undef __FUNCT__
41908f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
41918f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
41928f690400SShri Abhyankar {
41938f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
41948f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
41958f690400SShri Abhyankar   PetscErrorCode    ierr;
419629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
41978f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
41988f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
41998f690400SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
42008f690400SShri Abhyankar   const PetscScalar *b;
42018f690400SShri Abhyankar 
42028f690400SShri Abhyankar   PetscFunctionBegin;
42038f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42048f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
42058f690400SShri Abhyankar   t  = a->solve_work;
42068f690400SShri Abhyankar 
42078f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
420829b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
42098f690400SShri Abhyankar 
42108f690400SShri Abhyankar   /* forward solve the lower triangular */
421129b92fc1SShri Abhyankar   idx    = 2*r[0];
42128f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
42138f690400SShri Abhyankar   for (i=1; i<n; i++) {
42148f690400SShri Abhyankar     v     = aa + 4*ai[i];
42158f690400SShri Abhyankar     vi    = aj + ai[i];
42168f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
421729b92fc1SShri Abhyankar     idx   = 2*r[i];
42188f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
421929b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
422029b92fc1SShri Abhyankar       jdx   = 2*vi[m];
42218f690400SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
42228f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
42238f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
42248f690400SShri Abhyankar       v += 4;
42258f690400SShri Abhyankar     }
42268f690400SShri Abhyankar     idx = 2*i;
42278f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
42288f690400SShri Abhyankar   }
42298f690400SShri Abhyankar   /* backward solve the upper triangular */
42308f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
42318f690400SShri Abhyankar     k = 2*n-i;
42328f690400SShri Abhyankar     v    = aa + 4*ai[k];
42338f690400SShri Abhyankar     vi   = aj + ai[k];
42348f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
42358f690400SShri Abhyankar     idt  = 2*i;
42368f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
423729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
423829b92fc1SShri Abhyankar       idx   = 2*vi[m];
42398f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
42408f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
42418f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
42428f690400SShri Abhyankar       v += 4;
42438f690400SShri Abhyankar     }
424429b92fc1SShri Abhyankar     idc = 2*c[i];
42458f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
42468f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
42478f690400SShri Abhyankar   }
42488f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
42498f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
42508f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42518f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
42528f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
42538f690400SShri Abhyankar   PetscFunctionReturn(0);
42548f690400SShri Abhyankar }
42558f690400SShri Abhyankar 
4256*0c4413a7SShri Abhyankar #undef __FUNCT__
4257*0c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
4258*0c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4259*0c4413a7SShri Abhyankar {
4260*0c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
4261*0c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
4262*0c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4263*0c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
4264*0c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
4265*0c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
4266*0c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
4267*0c4413a7SShri Abhyankar   const PetscScalar *b;
4268*0c4413a7SShri Abhyankar 
4269*0c4413a7SShri Abhyankar   PetscFunctionBegin;
4270*0c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4271*0c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4272*0c4413a7SShri Abhyankar   t  = a->solve_work;
4273*0c4413a7SShri Abhyankar 
4274*0c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
4275*0c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
4276*0c4413a7SShri Abhyankar 
4277*0c4413a7SShri Abhyankar   /* forward solve the lower triangular */
4278*0c4413a7SShri Abhyankar   idx    = 2*r[0];
4279*0c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
4280*0c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
4281*0c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
4282*0c4413a7SShri Abhyankar     vi    = aj + ai[i];
4283*0c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
4284*0c4413a7SShri Abhyankar     idx   = 2*r[i];
4285*0c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
4286*0c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
4287*0c4413a7SShri Abhyankar       jdx   = 2*vi[m];
4288*0c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
4289*0c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
4290*0c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
4291*0c4413a7SShri Abhyankar       v += 4;
4292*0c4413a7SShri Abhyankar     }
4293*0c4413a7SShri Abhyankar     idx = 2*i;
4294*0c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
4295*0c4413a7SShri Abhyankar   }
4296*0c4413a7SShri Abhyankar   /* backward solve the upper triangular */
4297*0c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
4298*0c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
4299*0c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
4300*0c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
4301*0c4413a7SShri Abhyankar     idt  = 2*i;
4302*0c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
4303*0c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
4304*0c4413a7SShri Abhyankar       idx   = 2*vi[m];
4305*0c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
4306*0c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
4307*0c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
4308*0c4413a7SShri Abhyankar       v += 4;
4309*0c4413a7SShri Abhyankar     }
4310*0c4413a7SShri Abhyankar     idc = 2*c[i];
4311*0c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4312*0c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
4313*0c4413a7SShri Abhyankar   }
4314*0c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
4315*0c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4316*0c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4317*0c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4318*0c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4319*0c4413a7SShri Abhyankar   PetscFunctionReturn(0);
4320*0c4413a7SShri Abhyankar }
43218f690400SShri Abhyankar 
432215091d37SBarry Smith /*
432315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
432415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
432515091d37SBarry Smith */
43264a2ae208SSatish Balay #undef __FUNCT__
43274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4328dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
432915091d37SBarry Smith {
433015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4331690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4332dfbe8321SBarry Smith   PetscErrorCode    ierr;
4333690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4334d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4335d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4336d9fead3dSBarry Smith   const PetscScalar *b;
4337690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
433815091d37SBarry Smith 
433915091d37SBarry Smith   PetscFunctionBegin;
4340d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
434215091d37SBarry Smith 
434315091d37SBarry Smith   /* forward solve the lower triangular */
434415091d37SBarry Smith   idx    = 0;
434515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
434615091d37SBarry Smith   for (i=1; i<n; i++) {
434715091d37SBarry Smith     v     =  aa      + 4*ai[i];
434815091d37SBarry Smith     vi    =  aj      + ai[i];
434915091d37SBarry Smith     nz    =  diag[i] - ai[i];
435015091d37SBarry Smith     idx   +=  2;
4351f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
435215091d37SBarry Smith     while (nz--) {
435315091d37SBarry Smith       jdx   = 2*(*vi++);
435415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4355f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4356f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
435715091d37SBarry Smith       v    += 4;
435815091d37SBarry Smith     }
4359f1af5d2fSBarry Smith     x[idx]   = s1;
4360f1af5d2fSBarry Smith     x[1+idx] = s2;
436115091d37SBarry Smith   }
436215091d37SBarry Smith   /* backward solve the upper triangular */
436315091d37SBarry Smith   for (i=n-1; i>=0; i--){
436415091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
436515091d37SBarry Smith     vi   = aj + diag[i] + 1;
436615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
436715091d37SBarry Smith     idt  = 2*i;
4368f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
436915091d37SBarry Smith     while (nz--) {
437015091d37SBarry Smith       idx   = 2*(*vi++);
437115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4372f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4373f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
437415091d37SBarry Smith       v    += 4;
437515091d37SBarry Smith     }
437615091d37SBarry Smith     v        = aa +  4*diag[i];
4377f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4378f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
437915091d37SBarry Smith   }
438015091d37SBarry Smith 
4381d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43821ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4383dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
438415091d37SBarry Smith   PetscFunctionReturn(0);
438515091d37SBarry Smith }
438615091d37SBarry Smith 
43874a2ae208SSatish Balay #undef __FUNCT__
4388cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4389cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4390cee9d6f2SShri Abhyankar {
4391cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4392ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4393cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4394cee9d6f2SShri Abhyankar     PetscInt          jdx;
4395cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4396cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4397cee9d6f2SShri Abhyankar     const PetscScalar *b;
4398cee9d6f2SShri Abhyankar 
4399cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4400cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4401cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4402cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4403cee9d6f2SShri Abhyankar     idx    = 0;
4404cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4405cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4406cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
4407cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4408cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4409cee9d6f2SShri Abhyankar        idx  = 2*i;
4410cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4411ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4412ce3d78c0SShri Abhyankar          jdx   = 2*vi[k];
4413cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4414cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4415cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4416cee9d6f2SShri Abhyankar            v   +=  4;
4417cee9d6f2SShri Abhyankar         }
4418cee9d6f2SShri Abhyankar        x[idx]   = s1;
4419cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4420cee9d6f2SShri Abhyankar     }
4421cee9d6f2SShri Abhyankar 
4422cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4423cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4424cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
4425cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4426cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4427cee9d6f2SShri Abhyankar      idt = 2*i;
4428cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4429ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4430ce3d78c0SShri Abhyankar       idx   = 2*vi[k];
4431cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4432cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4433cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4434cee9d6f2SShri Abhyankar          v    += 4;
4435cee9d6f2SShri Abhyankar     }
4436cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4437cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4438cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4439cee9d6f2SShri Abhyankar   }
4440cee9d6f2SShri Abhyankar 
4441cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4442cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4443cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4444cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4445cee9d6f2SShri Abhyankar }
4446cee9d6f2SShri Abhyankar 
4447cee9d6f2SShri Abhyankar #undef __FUNCT__
4448b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4449b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4450b2b2dd24SShri Abhyankar {
4451b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4452b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4453b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4454b2b2dd24SShri Abhyankar     PetscInt          jdx;
4455b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4456b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4457b2b2dd24SShri Abhyankar     const PetscScalar *b;
4458b2b2dd24SShri Abhyankar 
4459b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4460b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4461b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4462b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4463b2b2dd24SShri Abhyankar     idx    = 0;
4464b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4465b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4466b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4467b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4468b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4469b2b2dd24SShri Abhyankar        idx  = 2*i;
4470b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4471b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4472b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4473b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4474b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4475b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4476b2b2dd24SShri Abhyankar            v   +=  4;
4477b2b2dd24SShri Abhyankar         }
4478b2b2dd24SShri Abhyankar        x[idx]   = s1;
4479b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4480b2b2dd24SShri Abhyankar     }
4481b2b2dd24SShri Abhyankar 
4482b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4483b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4484b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4485b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4486b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4487b2b2dd24SShri Abhyankar      idt = 2*i;
4488b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4489b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4490b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4491b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4492b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4493b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4494b2b2dd24SShri Abhyankar          v    += 4;
4495b2b2dd24SShri Abhyankar     }
4496b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4497b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4498b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4499b2b2dd24SShri Abhyankar   }
4500b2b2dd24SShri Abhyankar 
4501b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4502b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4503b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4504b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4505b2b2dd24SShri Abhyankar }
4506b2b2dd24SShri Abhyankar 
4507b2b2dd24SShri Abhyankar #undef __FUNCT__
45084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4509dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
45104e2b4712SSatish Balay {
45114e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
45124e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
45136849ba73SBarry Smith   PetscErrorCode ierr;
45145d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
45155d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
45163f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
451787828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
45184e2b4712SSatish Balay 
45194e2b4712SSatish Balay   PetscFunctionBegin;
45204e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
45214e2b4712SSatish Balay 
45221ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4524f1af5d2fSBarry Smith   t  = a->solve_work;
45254e2b4712SSatish Balay 
45264e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
45274e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
45284e2b4712SSatish Balay 
45294e2b4712SSatish Balay   /* forward solve the lower triangular */
4530f1af5d2fSBarry Smith   t[0] = b[*r++];
45314e2b4712SSatish Balay   for (i=1; i<n; i++) {
45324e2b4712SSatish Balay     v     = aa + ai[i];
45334e2b4712SSatish Balay     vi    = aj + ai[i];
45344e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4535f1af5d2fSBarry Smith     s1  = b[*r++];
45364e2b4712SSatish Balay     while (nz--) {
4537f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
45384e2b4712SSatish Balay     }
4539f1af5d2fSBarry Smith     t[i] = s1;
45404e2b4712SSatish Balay   }
45414e2b4712SSatish Balay   /* backward solve the upper triangular */
45424e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
45434e2b4712SSatish Balay     v    = aa + diag[i] + 1;
45444e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
45454e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4546f1af5d2fSBarry Smith     s1 = t[i];
45474e2b4712SSatish Balay     while (nz--) {
4548f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
45494e2b4712SSatish Balay     }
4550f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
45514e2b4712SSatish Balay   }
45524e2b4712SSatish Balay 
45534e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45544e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4557dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
45584e2b4712SSatish Balay   PetscFunctionReturn(0);
45594e2b4712SSatish Balay }
456015091d37SBarry Smith /*
456115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
456215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
456315091d37SBarry Smith */
45644a2ae208SSatish Balay #undef __FUNCT__
45654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4566dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
456715091d37SBarry Smith {
456815091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4569690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4570dfbe8321SBarry Smith   PetscErrorCode ierr;
4571690b6cddSBarry Smith   PetscInt       *diag = a->diag;
457215091d37SBarry Smith   MatScalar      *aa=a->a;
457387828ca2SBarry Smith   PetscScalar    *x,*b;
457487828ca2SBarry Smith   PetscScalar    s1,x1;
457515091d37SBarry Smith   MatScalar      *v;
4576690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
457715091d37SBarry Smith 
457815091d37SBarry Smith   PetscFunctionBegin;
45791ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
458115091d37SBarry Smith 
458215091d37SBarry Smith   /* forward solve the lower triangular */
458315091d37SBarry Smith   idx    = 0;
458415091d37SBarry Smith   x[0]   = b[0];
458515091d37SBarry Smith   for (i=1; i<n; i++) {
458615091d37SBarry Smith     v     =  aa      + ai[i];
458715091d37SBarry Smith     vi    =  aj      + ai[i];
458815091d37SBarry Smith     nz    =  diag[i] - ai[i];
458915091d37SBarry Smith     idx   +=  1;
4590f1af5d2fSBarry Smith     s1  =  b[idx];
459115091d37SBarry Smith     while (nz--) {
459215091d37SBarry Smith       jdx   = *vi++;
459315091d37SBarry Smith       x1    = x[jdx];
4594f1af5d2fSBarry Smith       s1 -= v[0]*x1;
459515091d37SBarry Smith       v    += 1;
459615091d37SBarry Smith     }
4597f1af5d2fSBarry Smith     x[idx]   = s1;
459815091d37SBarry Smith   }
459915091d37SBarry Smith   /* backward solve the upper triangular */
460015091d37SBarry Smith   for (i=n-1; i>=0; i--){
460115091d37SBarry Smith     v    = aa + diag[i] + 1;
460215091d37SBarry Smith     vi   = aj + diag[i] + 1;
460315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
460415091d37SBarry Smith     idt  = i;
4605f1af5d2fSBarry Smith     s1 = x[idt];
460615091d37SBarry Smith     while (nz--) {
460715091d37SBarry Smith       idx   = *vi++;
460815091d37SBarry Smith       x1    = x[idx];
4609f1af5d2fSBarry Smith       s1 -= v[0]*x1;
461015091d37SBarry Smith       v    += 1;
461115091d37SBarry Smith     }
461215091d37SBarry Smith     v        = aa +  diag[i];
4613f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
461415091d37SBarry Smith   }
46151ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
46161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4617dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
461815091d37SBarry Smith   PetscFunctionReturn(0);
461915091d37SBarry Smith }
46204e2b4712SSatish Balay 
46214e2b4712SSatish Balay /* ----------------------------------------------------------------*/
462216a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
46236bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
46246bce7ff8SHong Zhang 
46256bce7ff8SHong Zhang #undef __FUNCT__
46266bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
46276bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
46286bce7ff8SHong Zhang {
46296bce7ff8SHong Zhang   Mat            C=B;
46306bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
46316bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
46326bce7ff8SHong Zhang   PetscErrorCode ierr;
46336bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
46346bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
46356bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4636b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4637914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4638914a18a2SHong Zhang   MatScalar      *v_work;
46396bce7ff8SHong Zhang 
46406bce7ff8SHong Zhang   PetscFunctionBegin;
46416bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
46426bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4643914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4644914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
46456bce7ff8SHong Zhang   ics  = ic;
46466bce7ff8SHong Zhang 
4647914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
4648914a18a2SHong Zhang   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4649b588c5a2SHong Zhang   mwork    = v_work + bs;
4650b588c5a2SHong Zhang   v_pivots = (PetscInt*)(mwork + bs2);
4651914a18a2SHong Zhang 
46526bce7ff8SHong Zhang   for (i=0; i<n; i++){
46536bce7ff8SHong Zhang     /* zero rtmp */
46546bce7ff8SHong Zhang     /* L part */
46556bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
46566bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4657914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4658914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4659914a18a2SHong Zhang     }
46606bce7ff8SHong Zhang 
46616bce7ff8SHong Zhang     /* U part */
46626bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
46636bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
4664914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4665914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4666914a18a2SHong Zhang     }
46676bce7ff8SHong Zhang 
46686bce7ff8SHong Zhang     /* load in initial (unfactored row) */
46696bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
46706bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
4671914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
46726bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4673914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
46746bce7ff8SHong Zhang     }
46756bce7ff8SHong Zhang 
46766bce7ff8SHong Zhang     /* elimination */
46776bce7ff8SHong Zhang     bjtmp = bj + bi[i];
46786bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
4679b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
4680b1646270SShri Abhyankar       row = bjtmp[k];
4681914a18a2SHong Zhang       pc = rtmp + bs2*row;
4682914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4683914a18a2SHong Zhang       if (flg) {
4684914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
4685b588c5a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
46866bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4687914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
46886bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4689914a18a2SHong Zhang         for (j=0; j<nz; j++) {
4690914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4691914a18a2SHong Zhang         }
4692b588c5a2SHong Zhang         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
46936bce7ff8SHong Zhang       }
46946bce7ff8SHong Zhang     }
46956bce7ff8SHong Zhang 
46966bce7ff8SHong Zhang     /* finished row so stick it into b->a */
46976bce7ff8SHong Zhang     /* L part */
4698914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
46996bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
47006bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
47016bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4702914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
47036bce7ff8SHong Zhang     }
47046bce7ff8SHong Zhang 
47056bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
4706914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
47076bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
4708914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4709914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4710914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
47116bce7ff8SHong Zhang 
47126bce7ff8SHong Zhang     /* U part */
4713914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
47146bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
47156bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4716914a18a2SHong Zhang     for (j=0; j<nz; j++){
4717914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4718914a18a2SHong Zhang     }
47196bce7ff8SHong Zhang   }
47206bce7ff8SHong Zhang 
47216bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
47226bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
47236bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
47246bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
472527019359SHong Zhang 
47266bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
4727914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
47286bce7ff8SHong Zhang   PetscFunctionReturn(0);
47296bce7ff8SHong Zhang }
47306bce7ff8SHong Zhang 
47311a83e813SShri Abhyankar #undef __FUNCT__
47321a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
47331a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
47341a83e813SShri Abhyankar {
47351a83e813SShri Abhyankar   Mat            C=B;
47361a83e813SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
47371a83e813SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
47381a83e813SShri Abhyankar   PetscErrorCode ierr;
47391a83e813SShri Abhyankar   const PetscInt *r,*ic,*ics;
47401a83e813SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
47411a83e813SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
47421a83e813SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
47431a83e813SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
47441a83e813SShri Abhyankar   MatScalar      *v_work;
47451a83e813SShri Abhyankar 
47461a83e813SShri Abhyankar   PetscFunctionBegin;
47471a83e813SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
47481a83e813SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
47491a83e813SShri Abhyankar   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
47501a83e813SShri Abhyankar   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
47511a83e813SShri Abhyankar   ics  = ic;
47521a83e813SShri Abhyankar 
47531a83e813SShri Abhyankar   /* generate work space needed by dense LU factorization */
47541a83e813SShri Abhyankar   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
47551a83e813SShri Abhyankar   mwork    = v_work + bs;
47561a83e813SShri Abhyankar   v_pivots = (PetscInt*)(mwork + bs2);
47571a83e813SShri Abhyankar 
47581a83e813SShri Abhyankar   for (i=0; i<n; i++){
47591a83e813SShri Abhyankar     /* zero rtmp */
47601a83e813SShri Abhyankar     /* L part */
47611a83e813SShri Abhyankar     nz    = bi[i+1] - bi[i];
47621a83e813SShri Abhyankar     bjtmp = bj + bi[i];
47631a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
47641a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
47651a83e813SShri Abhyankar     }
47661a83e813SShri Abhyankar 
47671a83e813SShri Abhyankar     /* U part */
47681a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
47691a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
47701a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
47711a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
47721a83e813SShri Abhyankar     }
47731a83e813SShri Abhyankar 
47741a83e813SShri Abhyankar     /* load in initial (unfactored row) */
47751a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
47761a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
47771a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
47781a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
47791a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
47801a83e813SShri Abhyankar     }
47811a83e813SShri Abhyankar 
47821a83e813SShri Abhyankar     /* elimination */
47831a83e813SShri Abhyankar     bjtmp = bj + bi[i];
47841a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
47851a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
47861a83e813SShri Abhyankar       row = bjtmp[k];
47871a83e813SShri Abhyankar       pc = rtmp + bs2*row;
47881a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
47891a83e813SShri Abhyankar       if (flg) {
47901a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
47911a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
47921a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
47931a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
47941a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
47951a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
47961a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
47971a83e813SShri Abhyankar         }
47981a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
47991a83e813SShri Abhyankar       }
48001a83e813SShri Abhyankar     }
48011a83e813SShri Abhyankar 
48021a83e813SShri Abhyankar     /* finished row so stick it into b->a */
48031a83e813SShri Abhyankar     /* L part */
48041a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
48051a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
48061a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
48071a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
48081a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
48091a83e813SShri Abhyankar     }
48101a83e813SShri Abhyankar 
48111a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
48121a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
48131a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
48141a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
48151a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
48161a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
48171a83e813SShri Abhyankar 
48181a83e813SShri Abhyankar     /* U part */
48191a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
48201a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
48211a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
48221a83e813SShri Abhyankar     for (j=0; j<nz; j++){
48231a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
48241a83e813SShri Abhyankar     }
48251a83e813SShri Abhyankar   }
48261a83e813SShri Abhyankar 
48271a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
48281a83e813SShri Abhyankar   ierr = PetscFree(v_work);CHKERRQ(ierr);
48291a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
48301a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
48311a83e813SShri Abhyankar 
48321a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
48331a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
48341a83e813SShri Abhyankar   PetscFunctionReturn(0);
48351a83e813SShri Abhyankar }
48361a83e813SShri Abhyankar 
48376bce7ff8SHong Zhang /*
48386bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
483916a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
484016a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
48416bce7ff8SHong Zhang */
48426bce7ff8SHong Zhang #undef __FUNCT__
48436bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
48446bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
48456bce7ff8SHong Zhang {
48466bce7ff8SHong Zhang 
48476bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
48486bce7ff8SHong Zhang   PetscErrorCode     ierr;
484916a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
485016a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
48516bce7ff8SHong Zhang 
48526bce7ff8SHong Zhang   PetscFunctionBegin;
485316a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
485416a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
48556bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
485616a2bf60SHong Zhang 
485716a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
485816a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
485916a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
486016a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
486116a2bf60SHong Zhang   if (!b->diag){
486216a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
486316a2bf60SHong Zhang   }
4864914a18a2SHong Zhang   bdiag = b->diag;
48656bce7ff8SHong Zhang 
486616a2bf60SHong Zhang   if (n > 0) {
486716a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
48686bce7ff8SHong Zhang   }
48696bce7ff8SHong Zhang 
48706bce7ff8SHong Zhang   /* set bi and bj with new data structure */
48716bce7ff8SHong Zhang   bi = b->i;
48726bce7ff8SHong Zhang   bj = b->j;
48736bce7ff8SHong Zhang 
48746bce7ff8SHong Zhang   /* L part */
48756bce7ff8SHong Zhang   bi[0] = 0;
487616a2bf60SHong Zhang   for (i=0; i<n; i++){
48776bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
4878914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
48796bce7ff8SHong Zhang     aj = a->j + ai[i];
48806bce7ff8SHong Zhang     for (j=0; j<nz; j++){
48816bce7ff8SHong Zhang       *bj = aj[j]; bj++;
48826bce7ff8SHong Zhang     }
48836bce7ff8SHong Zhang   }
48846bce7ff8SHong Zhang 
48856bce7ff8SHong Zhang   /* U part */
488616a2bf60SHong Zhang   bi[n+1] = bi[n];
488716a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
48886bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
488916a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
48906bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
48916bce7ff8SHong Zhang     for (j=0; j<nz; j++){
48926bce7ff8SHong Zhang       *bj = aj[j]; bj++;
48936bce7ff8SHong Zhang     }
48946bce7ff8SHong Zhang     /* diag[i] */
48956bce7ff8SHong Zhang     *bj = i; bj++;
489616a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
48976bce7ff8SHong Zhang   }
48986bce7ff8SHong Zhang   PetscFunctionReturn(0);
48996bce7ff8SHong Zhang }
49006bce7ff8SHong Zhang 
490116a2bf60SHong Zhang #undef __FUNCT__
490216a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
490316a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
490416a2bf60SHong Zhang {
490516a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
490616a2bf60SHong Zhang   IS                 isicol;
490716a2bf60SHong Zhang   PetscErrorCode     ierr;
490816a2bf60SHong Zhang   const PetscInt     *r,*ic;
49097fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
491016a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
491116a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
491216a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
49137fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
491416a2bf60SHong Zhang   PetscReal          f;
491516a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
491616a2bf60SHong Zhang   PetscBT            lnkbt;
491716a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
491816a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
491916a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
492016a2bf60SHong Zhang   PetscTruth         missing;
49217fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
492216a2bf60SHong Zhang 
492316a2bf60SHong Zhang   PetscFunctionBegin;
492416a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
492516a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
492616a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
492716a2bf60SHong Zhang 
492816a2bf60SHong Zhang   f             = info->fill;
492916a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
493016a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
493116a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
493216a2bf60SHong Zhang 
493316a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
493416a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
49357fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
493616a2bf60SHong Zhang 
49377fa3a6a0SHong Zhang   if (!levels && both_identity) {
493816a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
493916a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
494016a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
49417fa3a6a0SHong Zhang     /* set MatSolve routines */
49427fa3a6a0SHong Zhang     switch (bs){
49437fa3a6a0SHong Zhang     case 2:
49447fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
49457fa3a6a0SHong Zhang       break;
49467fa3a6a0SHong Zhang     case 3:
49477fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
49487fa3a6a0SHong Zhang       break;
49497fa3a6a0SHong Zhang     case 4:
49507fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
49517fa3a6a0SHong Zhang       break;
49527fa3a6a0SHong Zhang     case 5:
49537fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
49547fa3a6a0SHong Zhang       break;
49557fa3a6a0SHong Zhang     case 6:
49567fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
49577fa3a6a0SHong Zhang       break;
49587fa3a6a0SHong Zhang     case 7:
49597fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
49607fa3a6a0SHong Zhang       break;
49617fa3a6a0SHong Zhang     default:
49627fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
49637fa3a6a0SHong Zhang       break;
49647fa3a6a0SHong Zhang     }
496516a2bf60SHong Zhang 
496616a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
496716a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
496816a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
496916a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
497016a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
497116a2bf60SHong Zhang     b->row           = isrow;
497216a2bf60SHong Zhang     b->col           = iscol;
497316a2bf60SHong Zhang     b->icol          = isicol;
497416a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
497516a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
497616a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4977b588c5a2SHong Zhang     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
497816a2bf60SHong Zhang     PetscFunctionReturn(0);
497916a2bf60SHong Zhang   }
498016a2bf60SHong Zhang 
498116a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
498216a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
498316a2bf60SHong Zhang 
498416a2bf60SHong Zhang   /* get new row pointers */
498516a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
498616a2bf60SHong Zhang   bi[0] = 0;
498716a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
498816a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
498916a2bf60SHong Zhang   bdiag[0]  = 0;
499016a2bf60SHong Zhang 
499116a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
499216a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
499316a2bf60SHong Zhang 
499416a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
499516a2bf60SHong Zhang   nlnk = n + 1;
499616a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
499716a2bf60SHong Zhang 
499816a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
499916a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
500016a2bf60SHong Zhang   current_space = free_space;
500116a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
500216a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
500316a2bf60SHong Zhang 
500416a2bf60SHong Zhang   for (i=0; i<n; i++) {
500516a2bf60SHong Zhang     nzi = 0;
500616a2bf60SHong Zhang     /* copy current row into linked list */
500716a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
500816a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
500916a2bf60SHong Zhang     cols = aj + ai[r[i]];
501016a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
501116a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
501216a2bf60SHong Zhang     nzi += nlnk;
501316a2bf60SHong Zhang 
501416a2bf60SHong Zhang     /* make sure diagonal entry is included */
501516a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
501616a2bf60SHong Zhang       fm = n;
501716a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
501816a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
501916a2bf60SHong Zhang       lnk[fm]    = i;
502016a2bf60SHong Zhang       lnk_lvl[i] = 0;
502116a2bf60SHong Zhang       nzi++; dcount++;
502216a2bf60SHong Zhang     }
502316a2bf60SHong Zhang 
502416a2bf60SHong Zhang     /* add pivot rows into the active row */
502516a2bf60SHong Zhang     nzbd = 0;
502616a2bf60SHong Zhang     prow = lnk[n];
502716a2bf60SHong Zhang     while (prow < i) {
502816a2bf60SHong Zhang       nnz      = bdiag[prow];
502916a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
503016a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
503116a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
503216a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
503316a2bf60SHong Zhang       nzi += nlnk;
503416a2bf60SHong Zhang       prow = lnk[prow];
503516a2bf60SHong Zhang       nzbd++;
503616a2bf60SHong Zhang     }
503716a2bf60SHong Zhang     bdiag[i] = nzbd;
503816a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
503916a2bf60SHong Zhang 
504016a2bf60SHong Zhang     /* if free space is not available, make more free space */
504116a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
504216a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
504316a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
504416a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
504516a2bf60SHong Zhang       reallocs++;
504616a2bf60SHong Zhang     }
504716a2bf60SHong Zhang 
504816a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
504916a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
505016a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
505116a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
505216a2bf60SHong Zhang 
505316a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
505416a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
505516a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
505616a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
505716a2bf60SHong Zhang     }
505816a2bf60SHong Zhang 
505916a2bf60SHong Zhang     current_space->array           += nzi;
506016a2bf60SHong Zhang     current_space->local_used      += nzi;
506116a2bf60SHong Zhang     current_space->local_remaining -= nzi;
506216a2bf60SHong Zhang     current_space_lvl->array           += nzi;
506316a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
506416a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
506516a2bf60SHong Zhang   }
506616a2bf60SHong Zhang 
506716a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
506816a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
506916a2bf60SHong Zhang 
507016a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
507116a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
507216a2bf60SHong Zhang 
507316a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5074783ef271SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
507516a2bf60SHong Zhang 
507616a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
507716a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
507816a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
507916a2bf60SHong Zhang 
508016a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
508116a2bf60SHong Zhang   {
508216a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
508316a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
508416a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
508516a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
508616a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
508716a2bf60SHong Zhang     if (diagonal_fill) {
508816a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
508916a2bf60SHong Zhang     }
509016a2bf60SHong Zhang   }
509116a2bf60SHong Zhang #endif
509216a2bf60SHong Zhang 
509316a2bf60SHong Zhang   /* put together the new matrix */
509416a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
509516a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
509616a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
509716a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
509816a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
509916a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
51007fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
510116a2bf60SHong Zhang   b->j          = bj;
510216a2bf60SHong Zhang   b->i          = bi;
510316a2bf60SHong Zhang   b->diag       = bdiag;
51047f53bb6cSHong Zhang   b->free_diag  = PETSC_TRUE;
510516a2bf60SHong Zhang   b->ilen       = 0;
510616a2bf60SHong Zhang   b->imax       = 0;
510716a2bf60SHong Zhang   b->row        = isrow;
510816a2bf60SHong Zhang   b->col        = iscol;
510916a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
511016a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
511116a2bf60SHong Zhang   b->icol       = isicol;
51127fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
511316a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
511416a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
51157fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
511616a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
511716a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
511816a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
511916a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
512016a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
51217fa3a6a0SHong Zhang   /* set MatSolve routines */
51227fa3a6a0SHong Zhang   if (both_identity){
51237fa3a6a0SHong Zhang     switch (bs){
51247fa3a6a0SHong Zhang     case 2:
51257fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
51267fa3a6a0SHong Zhang       break;
51277fa3a6a0SHong Zhang     case 3:
51287fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
51297fa3a6a0SHong Zhang       break;
51307fa3a6a0SHong Zhang     case 4:
51317fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
51327fa3a6a0SHong Zhang       break;
51337fa3a6a0SHong Zhang     case 5:
51347fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
51357fa3a6a0SHong Zhang       break;
51367fa3a6a0SHong Zhang     case 6:
51377fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
51387fa3a6a0SHong Zhang       break;
51397fa3a6a0SHong Zhang     case 7:
51407fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
51417fa3a6a0SHong Zhang       break;
51427fa3a6a0SHong Zhang     default:
51437fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
51447fa3a6a0SHong Zhang       break;
51457fa3a6a0SHong Zhang     }
51467fa3a6a0SHong Zhang   } else {
51477fa3a6a0SHong Zhang     switch (bs){
51487fa3a6a0SHong Zhang     case 2:
51497fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
51507fa3a6a0SHong Zhang       break;
51517fa3a6a0SHong Zhang     case 3:
51527fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
51537fa3a6a0SHong Zhang       break;
51547fa3a6a0SHong Zhang     case 4:
51557fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
51567fa3a6a0SHong Zhang       break;
51577fa3a6a0SHong Zhang     case 5:
51587fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
51597fa3a6a0SHong Zhang       break;
51607fa3a6a0SHong Zhang     case 6:
51617fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
51627fa3a6a0SHong Zhang       break;
51637fa3a6a0SHong Zhang     case 7:
51647fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
51657fa3a6a0SHong Zhang       break;
51667fa3a6a0SHong Zhang     default:
51677fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
51687fa3a6a0SHong Zhang       break;
51697fa3a6a0SHong Zhang     }
51707fa3a6a0SHong Zhang   }
517116a2bf60SHong Zhang   PetscFunctionReturn(0);
517216a2bf60SHong Zhang }
517316a2bf60SHong Zhang 
51744e2b4712SSatish Balay /*
51754e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
51764e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
51774e2b4712SSatish Balay    Not a good example of code reuse.
51784e2b4712SSatish Balay */
51794a2ae208SSatish Balay #undef __FUNCT__
51804a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
51810481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
51824e2b4712SSatish Balay {
51834e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
51844e2b4712SSatish Balay   IS             isicol;
51856849ba73SBarry Smith   PetscErrorCode ierr;
51865d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
51875d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5188a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5189d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
519041df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5191329f5518SBarry Smith   PetscReal      f;
519216a2bf60SHong Zhang   PetscTruth     newdatastruct=PETSC_FALSE;
51934e2b4712SSatish Balay 
51944e2b4712SSatish Balay   PetscFunctionBegin;
519516a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
519616a2bf60SHong Zhang   if (newdatastruct){
519716a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
519816a2bf60SHong Zhang     PetscFunctionReturn(0);
519916a2bf60SHong Zhang   }
520016a2bf60SHong Zhang 
52016bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
52026bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
52036bce7ff8SHong Zhang 
5204435faa5fSBarry Smith   f             = info->fill;
5205690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5206690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
52074c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
520816a2bf60SHong Zhang 
5209667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5210667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
52117d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5212309c388cSBarry Smith 
521341df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
521416a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
52156bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
52166bce7ff8SHong Zhang 
5217719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5218719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
5219bb3d539aSBarry Smith     b->row       = isrow;
5220bb3d539aSBarry Smith     b->col       = iscol;
5221bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5222bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5223bb3d539aSBarry Smith     b->icol      = isicol;
5224bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5225b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
52266bce7ff8SHong Zhang     PetscFunctionReturn(0);
52276bce7ff8SHong Zhang   }
52286bce7ff8SHong Zhang 
52296bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
52304e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
52314e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
52324e2b4712SSatish Balay 
52334e2b4712SSatish Balay     /* get new row pointers */
5234690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
52354e2b4712SSatish Balay     ainew[0] = 0;
52364e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5237690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5238690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
52394e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5240690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
52414e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5242690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
52434e2b4712SSatish Balay     /* im is level for each filled value */
5244690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
52454e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5246690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
52474e2b4712SSatish Balay     dloc[0]  = 0;
52484e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5249435faa5fSBarry Smith 
5250435faa5fSBarry Smith       /* copy prow into linked list */
52514e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
52523b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
52534e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
52544e2b4712SSatish Balay       fill[n]    = n;
5255435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
52564e2b4712SSatish Balay       while (nz--) {
52574e2b4712SSatish Balay 	fm  = n;
52584e2b4712SSatish Balay 	idx = ic[*xi++];
52594e2b4712SSatish Balay 	do {
52604e2b4712SSatish Balay 	  m  = fm;
52614e2b4712SSatish Balay 	  fm = fill[m];
52624e2b4712SSatish Balay 	} while (fm < idx);
52634e2b4712SSatish Balay 	fill[m]   = idx;
52644e2b4712SSatish Balay 	fill[idx] = fm;
52654e2b4712SSatish Balay 	im[idx]   = 0;
52664e2b4712SSatish Balay       }
5267435faa5fSBarry Smith 
5268435faa5fSBarry Smith       /* make sure diagonal entry is included */
5269435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5270435faa5fSBarry Smith 	fm = n;
5271435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5272435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5273435faa5fSBarry Smith 	fill[fm]   = prow;
5274435faa5fSBarry Smith 	im[prow]   = 0;
5275435faa5fSBarry Smith 	nzf++;
5276335d9088SBarry Smith 	dcount++;
5277435faa5fSBarry Smith       }
5278435faa5fSBarry Smith 
52794e2b4712SSatish Balay       nzi = 0;
52804e2b4712SSatish Balay       row = fill[n];
52814e2b4712SSatish Balay       while (row < prow) {
52824e2b4712SSatish Balay 	incrlev = im[row] + 1;
52834e2b4712SSatish Balay 	nz      = dloc[row];
5284435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
52854e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
52864e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
52874e2b4712SSatish Balay 	fm      = row;
52884e2b4712SSatish Balay 	while (nnz-- > 0) {
52894e2b4712SSatish Balay 	  idx = *xi++;
52904e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
52914e2b4712SSatish Balay 	    flev++;
52924e2b4712SSatish Balay 	    continue;
52934e2b4712SSatish Balay 	  }
52944e2b4712SSatish Balay 	  do {
52954e2b4712SSatish Balay 	    m  = fm;
52964e2b4712SSatish Balay 	    fm = fill[m];
52974e2b4712SSatish Balay 	  } while (fm < idx);
52984e2b4712SSatish Balay 	  if (fm != idx) {
52994e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
53004e2b4712SSatish Balay 	    fill[m]   = idx;
53014e2b4712SSatish Balay 	    fill[idx] = fm;
53024e2b4712SSatish Balay 	    fm        = idx;
53034e2b4712SSatish Balay 	    nzf++;
5304ecf371e4SBarry Smith 	  } else {
53054e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
53064e2b4712SSatish Balay 	  }
53074e2b4712SSatish Balay 	  flev++;
53084e2b4712SSatish Balay 	}
53094e2b4712SSatish Balay 	row = fill[row];
53104e2b4712SSatish Balay 	nzi++;
53114e2b4712SSatish Balay       }
53124e2b4712SSatish Balay       /* copy new filled row into permanent storage */
53134e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
53144e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5315ecf371e4SBarry Smith 
5316ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5317ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5318ecf371e4SBarry Smith 	/* just double the memory each time */
5319690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5320ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
53214e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
53224e2b4712SSatish Balay 	jmax += maxadd;
5323ecf371e4SBarry Smith 
5324ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
53255d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
53265d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5327606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
53285d0c19d7SBarry Smith 	ajnew = xitmp;
53295d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
53305d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5331606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
53325d0c19d7SBarry Smith 	ajfill = xitmp;
5333eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
53344e2b4712SSatish Balay       }
53355d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
53364e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
53374e2b4712SSatish Balay       dloc[prow]  = nzi;
53384e2b4712SSatish Balay       fm          = fill[n];
53394e2b4712SSatish Balay       while (nzf--) {
53405d0c19d7SBarry Smith 	*xitmp++ = fm;
53414e2b4712SSatish Balay 	*flev++ = im[fm];
53424e2b4712SSatish Balay 	fm      = fill[fm];
53434e2b4712SSatish Balay       }
5344435faa5fSBarry Smith       /* make sure row has diagonal entry */
5345435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
534677431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
53472401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5348435faa5fSBarry Smith       }
53494e2b4712SSatish Balay     }
5350606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
53514e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
53524e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5353606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5354606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
53554e2b4712SSatish Balay 
53566cf91177SBarry Smith #if defined(PETSC_USE_INFO)
53574e2b4712SSatish Balay     {
5358329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5359ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5360ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5361ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5362ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5363335d9088SBarry Smith       if (diagonal_fill) {
5364ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5365335d9088SBarry Smith       }
53664e2b4712SSatish Balay     }
536763ba0a88SBarry Smith #endif
53684e2b4712SSatish Balay 
53694e2b4712SSatish Balay     /* put together the new matrix */
5370719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5371719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5372719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
5373e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5374e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
53757c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5376a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
53774e2b4712SSatish Balay     b->j          = ajnew;
53784e2b4712SSatish Balay     b->i          = ainew;
53794e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
53804e2b4712SSatish Balay     b->diag       = dloc;
53817f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
53824e2b4712SSatish Balay     b->ilen       = 0;
53834e2b4712SSatish Balay     b->imax       = 0;
53844e2b4712SSatish Balay     b->row        = isrow;
53854e2b4712SSatish Balay     b->col        = iscol;
5386bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5387c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5388c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5389e51c0b9cSSatish Balay     b->icol       = isicol;
539087828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
53914e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
53924e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5393719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
53944e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
53954e2b4712SSatish Balay 
5396719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
5397719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
5398719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
53996bce7ff8SHong Zhang 
540041df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
54018661488fSKris Buschelman   PetscFunctionReturn(0);
54028661488fSKris Buschelman }
54038661488fSKris Buschelman 
5404732ee342SKris Buschelman #undef __FUNCT__
54057e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5406dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
54077e7071cdSKris Buschelman {
540812272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
540912272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
54105a9542e3SKris Buschelman   PetscFunctionBegin;
54117cf1b8d3SKris Buschelman   /* Undo Column scaling */
54127cf1b8d3SKris Buschelman /*    while (nz--) { */
54137cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
54147cf1b8d3SKris Buschelman /*    } */
5415c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5416c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
54177cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
54187cf1b8d3SKris Buschelman }
54197cf1b8d3SKris Buschelman 
54207cf1b8d3SKris Buschelman #undef __FUNCT__
54217cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5422dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
54237cf1b8d3SKris Buschelman {
54247cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5425b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
54262aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
54275a9542e3SKris Buschelman   PetscFunctionBegin;
54280b9da03eSKris Buschelman   /* Is this really necessary? */
542920235379SKris Buschelman   while (nz--) {
54300b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
54317e7071cdSKris Buschelman   }
5432c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
54337e7071cdSKris Buschelman   PetscFunctionReturn(0);
54347e7071cdSKris Buschelman }
54357e7071cdSKris Buschelman 
5436732ee342SKris Buschelman 
5437