xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision a2d6a19a1cf2c01243ae36570ee9e459234ca5ff)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11765c42ef9dSBarry Smith /* ----------------------------------------------------------- */
11775c42ef9dSBarry Smith #undef __FUNCT__
11785c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
11795c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11805c42ef9dSBarry Smith {
11815c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
11825c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
11835c42ef9dSBarry Smith   PetscErrorCode    ierr;
11845c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11855c42ef9dSBarry Smith   PetscInt          i,n=a->mbs,j;
11865c42ef9dSBarry Smith   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
11875c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
11885c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
11895c42ef9dSBarry Smith   const PetscScalar *b;
11905c42ef9dSBarry Smith   PetscFunctionBegin;
11915c42ef9dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
11925c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
11935c42ef9dSBarry Smith   t    = a->solve_work;
11945c42ef9dSBarry Smith 
11955c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11965c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
11975c42ef9dSBarry Smith 
11985c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
11995c42ef9dSBarry Smith   for (i=0; i<n; i++) {
12005c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
12015c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
12025c42ef9dSBarry Smith     }
12035c42ef9dSBarry Smith   }
12045c42ef9dSBarry Smith 
12055c42ef9dSBarry Smith 
12065c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
12075c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
12085c42ef9dSBarry Smith   for (i=0; i<n; i++){
12095c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12105c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
12115c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
12125c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
12135c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
12145c42ef9dSBarry Smith     while (nz--) {
12155c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
12165c42ef9dSBarry Smith       v += bs2;
12175c42ef9dSBarry Smith     }
12185c42ef9dSBarry Smith   }
12195c42ef9dSBarry Smith 
12205c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
12215c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
12225c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
12235c42ef9dSBarry Smith     vi  = aj + ai[i];
12245c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
12255c42ef9dSBarry Smith     while (nz--) {
12265c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
12275c42ef9dSBarry Smith       v += bs2;
12285c42ef9dSBarry Smith     }
12295c42ef9dSBarry Smith   }
12305c42ef9dSBarry Smith 
12315c42ef9dSBarry Smith   /* copy t into x according to permutation */
12325c42ef9dSBarry Smith   for (i=0; i<n; i++) {
12335c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
12345c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
12355c42ef9dSBarry Smith     }
12365c42ef9dSBarry Smith   }
12375c42ef9dSBarry Smith 
12385c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12395c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12405c42ef9dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12415c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
12425c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
12435c42ef9dSBarry Smith   PetscFunctionReturn(0);
12445c42ef9dSBarry Smith }
12455c42ef9dSBarry Smith 
12464a2ae208SSatish Balay #undef __FUNCT__
12474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1248dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
12494e2b4712SSatish Balay {
12504e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12514e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
12526849ba73SBarry Smith   PetscErrorCode ierr;
12535d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
12545d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
12553f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
125687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
125787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
12584e2b4712SSatish Balay 
12594e2b4712SSatish Balay   PetscFunctionBegin;
12601ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12611ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1262f1af5d2fSBarry Smith   t  = a->solve_work;
12634e2b4712SSatish Balay 
12644e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
12654e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
12664e2b4712SSatish Balay 
12674e2b4712SSatish Balay   /* forward solve the lower triangular */
12684e2b4712SSatish Balay   idx    = 7*(*r++);
1269f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1270f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1271f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12724e2b4712SSatish Balay 
12734e2b4712SSatish Balay   for (i=1; i<n; i++) {
12744e2b4712SSatish Balay     v     = aa + 49*ai[i];
12754e2b4712SSatish Balay     vi    = aj + ai[i];
12764e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12774e2b4712SSatish Balay     idx   = 7*(*r++);
1278f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1279f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12804e2b4712SSatish Balay     while (nz--) {
12814e2b4712SSatish Balay       idx   = 7*(*vi++);
1282f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1283f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1284f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1285f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1286f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1287f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1288f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1289f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1290f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1291f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12924e2b4712SSatish Balay       v += 49;
12934e2b4712SSatish Balay     }
12944e2b4712SSatish Balay     idx = 7*i;
1295f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1296f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1297f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12984e2b4712SSatish Balay   }
12994e2b4712SSatish Balay   /* backward solve the upper triangular */
13004e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
13014e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
13024e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
13034e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
13044e2b4712SSatish Balay     idt  = 7*i;
1305f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1306f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1307f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
13084e2b4712SSatish Balay     while (nz--) {
13094e2b4712SSatish Balay       idx   = 7*(*vi++);
1310f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1311f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1312f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1313f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1314f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1315f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1316f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1317f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1318f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1319f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13204e2b4712SSatish Balay       v += 49;
13214e2b4712SSatish Balay     }
13224e2b4712SSatish Balay     idc = 7*(*c--);
13234e2b4712SSatish Balay     v   = aa + 49*diag[i];
1324f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1325f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1326f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1327f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1328f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1329f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1330f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1331f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1332f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1333f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1334f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1335f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1336f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1337f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13384e2b4712SSatish Balay   }
13394e2b4712SSatish Balay 
13404e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13414e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13421ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1344dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13454e2b4712SSatish Balay   PetscFunctionReturn(0);
13464e2b4712SSatish Balay }
13474e2b4712SSatish Balay 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
/*
   MatSolve_SeqBAIJ_7_newdatastruct - Earlier copy of the 7x7 block solve,
   compiled only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Input Parameters:
+  A  - matrix holding the factors, with index sets a->row and a->col
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Entry (r,c) of a block is read from v[r+7*c].  In the forward sweep all
   blocks between ai[i] and ai[i+1] are used for block row i.  In the
   backward sweep the blocks for block row i are located through
   ai[2*n-i]; all but the last of them are eliminated and the last one is
   then used as the multiplier for the row.
*/
PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol = a->col,isrow = a->row;
  PetscErrorCode ierr;
  const PetscInt *rowperm,*colperm;
  const PetscInt *ai = a->i,*aj = a->j,*cols;
  PetscInt       nblk = a->mbs,row,slot,m,k,cnt;
  MatScalar      *aa = a->a,*v;
  PetscScalar    y[7];
  PetscScalar    *sol,*rhs,*work,*w,*dst,*out;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = a->solve_work;

  ierr = ISGetIndices(isrow,&rowperm);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&colperm);CHKERRQ(ierr);

  /* forward sweep: block row 0 is only copied, the others are reduced */
  w = rhs + 7*rowperm[0];
  for (k=0; k<7; k++) work[k] = w[k];

  for (row=1; row<nblk; row++) {
    v    = aa + 49*ai[row];
    cols = aj + ai[row];
    cnt  = ai[row+1] - ai[row];
    w    = rhs + 7*rowperm[row];
    for (k=0; k<7; k++) y[k] = w[k];
    for (m=0; m<cnt; m++) {
      w = work + 7*cols[m];
      y[0] -= v[0]*w[0] + v[7]*w[1]  + v[14]*w[2] + v[21]*w[3] + v[28]*w[4] + v[35]*w[5] + v[42]*w[6];
      y[1] -= v[1]*w[0] + v[8]*w[1]  + v[15]*w[2] + v[22]*w[3] + v[29]*w[4] + v[36]*w[5] + v[43]*w[6];
      y[2] -= v[2]*w[0] + v[9]*w[1]  + v[16]*w[2] + v[23]*w[3] + v[30]*w[4] + v[37]*w[5] + v[44]*w[6];
      y[3] -= v[3]*w[0] + v[10]*w[1] + v[17]*w[2] + v[24]*w[3] + v[31]*w[4] + v[38]*w[5] + v[45]*w[6];
      y[4] -= v[4]*w[0] + v[11]*w[1] + v[18]*w[2] + v[25]*w[3] + v[32]*w[4] + v[39]*w[5] + v[46]*w[6];
      y[5] -= v[5]*w[0] + v[12]*w[1] + v[19]*w[2] + v[26]*w[3] + v[33]*w[4] + v[40]*w[5] + v[47]*w[6];
      y[6] -= v[6]*w[0] + v[13]*w[1] + v[20]*w[2] + v[27]*w[3] + v[34]*w[4] + v[41]*w[5] + v[48]*w[6];
      v += 49;
    }
    dst = work + 7*row;
    for (k=0; k<7; k++) dst[k] = y[k];
  }

  /* backward sweep, last block row first */
  for (row=nblk-1; row>=0; row--) {
    slot = 2*nblk - row;
    v    = aa + 49*ai[slot];
    cols = aj + ai[slot];
    cnt  = ai[slot+1] - ai[slot] - 1;     /* the last block of the range is kept for the multiply below */
    dst  = work + 7*row;
    for (k=0; k<7; k++) y[k] = dst[k];
    for (m=0; m<cnt; m++) {
      w = work + 7*cols[m];
      y[0] -= v[0]*w[0] + v[7]*w[1]  + v[14]*w[2] + v[21]*w[3] + v[28]*w[4] + v[35]*w[5] + v[42]*w[6];
      y[1] -= v[1]*w[0] + v[8]*w[1]  + v[15]*w[2] + v[22]*w[3] + v[29]*w[4] + v[36]*w[5] + v[43]*w[6];
      y[2] -= v[2]*w[0] + v[9]*w[1]  + v[16]*w[2] + v[23]*w[3] + v[30]*w[4] + v[37]*w[5] + v[44]*w[6];
      y[3] -= v[3]*w[0] + v[10]*w[1] + v[17]*w[2] + v[24]*w[3] + v[31]*w[4] + v[38]*w[5] + v[45]*w[6];
      y[4] -= v[4]*w[0] + v[11]*w[1] + v[18]*w[2] + v[25]*w[3] + v[32]*w[4] + v[39]*w[5] + v[46]*w[6];
      y[5] -= v[5]*w[0] + v[12]*w[1] + v[19]*w[2] + v[26]*w[3] + v[33]*w[4] + v[40]*w[5] + v[47]*w[6];
      y[6] -= v[6]*w[0] + v[13]*w[1] + v[20]*w[2] + v[27]*w[3] + v[34]*w[4] + v[41]*w[5] + v[48]*w[6];
      v += 49;
    }
    /* v was advanced past the cnt eliminated blocks and now addresses the multiplier block */
    out = sol + 7*colperm[row];
    dst[0] = v[0]*y[0] + v[7]*y[1]  + v[14]*y[2] + v[21]*y[3] + v[28]*y[4] + v[35]*y[5] + v[42]*y[6];
    dst[1] = v[1]*y[0] + v[8]*y[1]  + v[15]*y[2] + v[22]*y[3] + v[29]*y[4] + v[36]*y[5] + v[43]*y[6];
    dst[2] = v[2]*y[0] + v[9]*y[1]  + v[16]*y[2] + v[23]*y[3] + v[30]*y[4] + v[37]*y[5] + v[44]*y[6];
    dst[3] = v[3]*y[0] + v[10]*y[1] + v[17]*y[2] + v[24]*y[3] + v[31]*y[4] + v[38]*y[5] + v[45]*y[6];
    dst[4] = v[4]*y[0] + v[11]*y[1] + v[18]*y[2] + v[25]*y[3] + v[32]*y[4] + v[39]*y[5] + v[46]*y[6];
    dst[5] = v[5]*y[0] + v[12]*y[1] + v[19]*y[2] + v[26]*y[3] + v[33]*y[4] + v[40]*y[5] + v[47]*y[6];
    dst[6] = v[6]*y[0] + v[13]*y[1] + v[20]*y[2] + v[27]*y[3] + v[34]*y[4] + v[41]*y[5] + v[48]*y[6];
    for (k=0; k<7; k++) out[k] = dst[k];
  }

  ierr = ISRestoreIndices(isrow,&rowperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&colperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
14518f690400SShri Abhyankar 
14528f690400SShri Abhyankar #undef __FUNCT__
1453*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1454*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
145535aa4fcfSShri Abhyankar {
145635aa4fcfSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
145735aa4fcfSShri Abhyankar   IS             iscol=a->col,isrow=a->row;
145835aa4fcfSShri Abhyankar   PetscErrorCode ierr;
145935aa4fcfSShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
146035aa4fcfSShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
146135aa4fcfSShri Abhyankar   MatScalar      *aa=a->a,*v;
146235aa4fcfSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
146335aa4fcfSShri Abhyankar   PetscScalar    *x,*b,*t;
146435aa4fcfSShri Abhyankar 
146535aa4fcfSShri Abhyankar   PetscFunctionBegin;
146635aa4fcfSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
146735aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
146835aa4fcfSShri Abhyankar   t  = a->solve_work;
146935aa4fcfSShri Abhyankar 
147035aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
147135aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
147235aa4fcfSShri Abhyankar 
147335aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
147435aa4fcfSShri Abhyankar   idx    = 7*r[0];
147535aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
147635aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
147735aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
147835aa4fcfSShri Abhyankar 
147935aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
148035aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
148135aa4fcfSShri Abhyankar     vi    = aj + ai[i];
148235aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
148335aa4fcfSShri Abhyankar     idx   = 7*r[i];
148435aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
148535aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
148635aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
148735aa4fcfSShri Abhyankar       idx   = 7*vi[m];
148835aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
148935aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
149035aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
149135aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
149235aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
149335aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
149435aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
149535aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
149635aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
149735aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
149835aa4fcfSShri Abhyankar       v += 49;
149935aa4fcfSShri Abhyankar     }
150035aa4fcfSShri Abhyankar     idx = 7*i;
150135aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
150235aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
150335aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
150435aa4fcfSShri Abhyankar   }
150535aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
150635aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
150735aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
150835aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
150935aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
151035aa4fcfSShri Abhyankar     idt  = 7*i;
151135aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
151235aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
151335aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
151435aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
151535aa4fcfSShri Abhyankar       idx   = 7*vi[m];
151635aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
151735aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
151835aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
151935aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
152035aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
152135aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
152235aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
152335aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
152435aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
152535aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
152635aa4fcfSShri Abhyankar       v += 49;
152735aa4fcfSShri Abhyankar     }
152835aa4fcfSShri Abhyankar     idc = 7*c[i];
152935aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
153035aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
153135aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
153235aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
153335aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
153435aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
153535aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
153635aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
153735aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
153835aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
153935aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
154035aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
154135aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
154235aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
154335aa4fcfSShri Abhyankar   }
154435aa4fcfSShri Abhyankar 
154535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
154635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
154735aa4fcfSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
154835aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
154935aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
155035aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
155135aa4fcfSShri Abhyankar }
155235aa4fcfSShri Abhyankar 
155335aa4fcfSShri Abhyankar #undef __FUNCT__
15544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1555dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
155615091d37SBarry Smith {
155715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1558690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1559dfbe8321SBarry Smith   PetscErrorCode    ierr;
1560690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1561d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1562d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1563d9fead3dSBarry Smith   const PetscScalar *b;
156415091d37SBarry Smith 
156515091d37SBarry Smith   PetscFunctionBegin;
1566d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
156815091d37SBarry Smith   /* forward solve the lower triangular */
156915091d37SBarry Smith   idx    = 0;
157015091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
157115091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
157215091d37SBarry Smith   x[6] = b[6+idx];
157315091d37SBarry Smith   for (i=1; i<n; i++) {
157415091d37SBarry Smith     v     =  aa + 49*ai[i];
157515091d37SBarry Smith     vi    =  aj + ai[i];
157615091d37SBarry Smith     nz    =  diag[i] - ai[i];
157715091d37SBarry Smith     idx   =  7*i;
1578f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1579f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1580f1af5d2fSBarry Smith     s7  =  b[6+idx];
158115091d37SBarry Smith     while (nz--) {
158215091d37SBarry Smith       jdx   = 7*(*vi++);
158315091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
158415091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
158515091d37SBarry Smith       x7    = x[6+jdx];
1586f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1587f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1588f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1589f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1590f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1591f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1592f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
159315091d37SBarry Smith       v += 49;
159415091d37SBarry Smith      }
1595f1af5d2fSBarry Smith     x[idx]   = s1;
1596f1af5d2fSBarry Smith     x[1+idx] = s2;
1597f1af5d2fSBarry Smith     x[2+idx] = s3;
1598f1af5d2fSBarry Smith     x[3+idx] = s4;
1599f1af5d2fSBarry Smith     x[4+idx] = s5;
1600f1af5d2fSBarry Smith     x[5+idx] = s6;
1601f1af5d2fSBarry Smith     x[6+idx] = s7;
160215091d37SBarry Smith   }
160315091d37SBarry Smith   /* backward solve the upper triangular */
160415091d37SBarry Smith   for (i=n-1; i>=0; i--){
160515091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
160615091d37SBarry Smith     vi   = aj + diag[i] + 1;
160715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
160815091d37SBarry Smith     idt  = 7*i;
1609f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1610f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1611f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1612f1af5d2fSBarry Smith     s7 = x[6+idt];
161315091d37SBarry Smith     while (nz--) {
161415091d37SBarry Smith       idx   = 7*(*vi++);
161515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
161615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
161715091d37SBarry Smith       x7    = x[6+idx];
1618f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1619f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1620f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1621f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1622f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1623f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1624f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
162515091d37SBarry Smith       v += 49;
162615091d37SBarry Smith     }
162715091d37SBarry Smith     v        = aa + 49*diag[i];
1628f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1629f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1630f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1631f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1632f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1633f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1634f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1635f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1636f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1637f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1638f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1639f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1640f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1641f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
164215091d37SBarry Smith   }
164315091d37SBarry Smith 
1644d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16451ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1646dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
164715091d37SBarry Smith   PetscFunctionReturn(0);
164815091d37SBarry Smith }
164915091d37SBarry Smith 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
/*
   Older variant of the block-size-7 natural-ordering solve; it is compiled
   only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   Reads the right-hand side from bb and writes the result into xx.

   Forward sweep:  block row i uses the blocks a->i[i] .. a->i[i+1]-1.
   Backward sweep: block row i is located through a->i[2*n-i]; all but the
                   last block of that row are eliminated and the last block
                   is then applied as the inverse diagonal.

   Entry (r,c) of every 7x7 block is read from v[r+7*c].
*/
PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          n = a->mbs,*ai = a->i,*aj = a->j;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  PetscInt          i,nz,row,*vi;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*br,*xb;
  PetscScalar       *x,*xr;
  PetscScalar       s1,s2,s3,s4,s5,s6,s7;
  PetscScalar       x1,x2,x3,x4,x5,x6,x7;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular: block row 0 is a plain copy */
  x[0] = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
  x[4] = b[4]; x[5] = b[5]; x[6] = b[6];
  for (i=1; i<n; i++) {
    v  = aa + bs2*ai[i];
    vi = aj + ai[i];
    nz = ai[i+1] - ai[i];
    br = b + bs*i;
    s1 = br[0]; s2 = br[1]; s3 = br[2]; s4 = br[3];
    s5 = br[4]; s6 = br[5]; s7 = br[6];
    while (nz-- > 0) {
      /* already-computed block of x selected by the next column index */
      xb = x + bs*(*vi++);
      x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
      x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    xr    = x + bs*i;
    xr[0] = s1; xr[1] = s2; xr[2] = s3; xr[3] = s4;
    xr[4] = s5; xr[5] = s6; xr[6] = s7;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    row = 2*n - i;
    v   = aa + bs2*ai[row];
    vi  = aj + ai[row];
    nz  = ai[row+1] - ai[row] - 1; /* the last block of the row is handled after the loop */
    xr  = x + bs*i;
    s1 = xr[0]; s2 = xr[1]; s3 = xr[2]; s4 = xr[3];
    s5 = xr[4]; s6 = xr[5]; s7 = xr[6];
    while (nz-- > 0) {
      xb = x + bs*(*vi++);
      x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
      x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
      s1 -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
      s2 -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
      s3 -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
      s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
      s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
      s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
      s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
      v  += bs2;
    }
    /* x = inv_diagonal*x; v now addresses the last block of the row */
    xr[0] = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
    xr[1] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
    xr[2] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
    xr[3] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
    xr[4] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
    xr[5] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
    xr[6] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
1739cee9d6f2SShri Abhyankar 
1740cee9d6f2SShri Abhyankar #undef __FUNCT__
1741*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1742*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
174353cca76cSShri Abhyankar {
174453cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
174553cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
174653cca76cSShri Abhyankar     PetscErrorCode    ierr;
174753cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
174853cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
174953cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
175053cca76cSShri Abhyankar     PetscScalar       *x;
175153cca76cSShri Abhyankar     const PetscScalar *b;
175253cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
175353cca76cSShri Abhyankar 
175453cca76cSShri Abhyankar     PetscFunctionBegin;
175553cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
175653cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
175753cca76cSShri Abhyankar     /* forward solve the lower triangular */
175853cca76cSShri Abhyankar     idx    = 0;
175953cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
176053cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
176153cca76cSShri Abhyankar     for (i=1; i<n; i++) {
176253cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
176353cca76cSShri Abhyankar        vi   = aj + ai[i];
176453cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
176553cca76cSShri Abhyankar       idx   = bs*i;
176653cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
176753cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
176853cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
176953cca76cSShri Abhyankar           jdx   = bs*vi[k];
177053cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
177153cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
177253cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
177353cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
177453cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
177553cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
177653cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
177753cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
177853cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
177953cca76cSShri Abhyankar           v   +=  bs2;
178053cca76cSShri Abhyankar         }
178153cca76cSShri Abhyankar 
178253cca76cSShri Abhyankar        x[idx]   = s1;
178353cca76cSShri Abhyankar        x[1+idx] = s2;
178453cca76cSShri Abhyankar        x[2+idx] = s3;
178553cca76cSShri Abhyankar        x[3+idx] = s4;
178653cca76cSShri Abhyankar        x[4+idx] = s5;
178753cca76cSShri Abhyankar        x[5+idx] = s6;
178853cca76cSShri Abhyankar        x[6+idx] = s7;
178953cca76cSShri Abhyankar     }
179053cca76cSShri Abhyankar 
179153cca76cSShri Abhyankar    /* backward solve the upper triangular */
179253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
179353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
179453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
179553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
179653cca76cSShri Abhyankar      idt = bs*i;
179753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
179853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
179953cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
180053cca76cSShri Abhyankar       idx   = bs*vi[k];
180153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
180253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
180353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
180453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
180553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
180653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
180753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
180853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
180953cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
181053cca76cSShri Abhyankar         v   +=  bs2;
181153cca76cSShri Abhyankar     }
181253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
181353cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
181453cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
181553cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
181653cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
181753cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
181853cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
181953cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
182053cca76cSShri Abhyankar   }
182153cca76cSShri Abhyankar 
182253cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
182353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
182453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
182553cca76cSShri Abhyankar   PetscFunctionReturn(0);
182653cca76cSShri Abhyankar }
182753cca76cSShri Abhyankar 
182853cca76cSShri Abhyankar #undef __FUNCT__
18294a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1830dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
183115091d37SBarry Smith {
183215091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
183315091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
18346849ba73SBarry Smith   PetscErrorCode    ierr;
18355d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
18365d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1837d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1838d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1839d9fead3dSBarry Smith   const PetscScalar *b;
184015091d37SBarry Smith   PetscFunctionBegin;
1841d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1843f1af5d2fSBarry Smith   t  = a->solve_work;
184415091d37SBarry Smith 
184515091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
184615091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
184715091d37SBarry Smith 
184815091d37SBarry Smith   /* forward solve the lower triangular */
184915091d37SBarry Smith   idx    = 6*(*r++);
1850f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1851f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1852f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
185315091d37SBarry Smith   for (i=1; i<n; i++) {
185415091d37SBarry Smith     v     = aa + 36*ai[i];
185515091d37SBarry Smith     vi    = aj + ai[i];
185615091d37SBarry Smith     nz    = diag[i] - ai[i];
185715091d37SBarry Smith     idx   = 6*(*r++);
1858f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1859f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
186015091d37SBarry Smith     while (nz--) {
186115091d37SBarry Smith       idx   = 6*(*vi++);
1862f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1863f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1864f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1865f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1866f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1867f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1868f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1869f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
187015091d37SBarry Smith       v += 36;
187115091d37SBarry Smith     }
187215091d37SBarry Smith     idx = 6*i;
1873f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1874f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1875f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
187615091d37SBarry Smith   }
187715091d37SBarry Smith   /* backward solve the upper triangular */
187815091d37SBarry Smith   for (i=n-1; i>=0; i--){
187915091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
188015091d37SBarry Smith     vi   = aj + diag[i] + 1;
188115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
188215091d37SBarry Smith     idt  = 6*i;
1883f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1884f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1885f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
188615091d37SBarry Smith     while (nz--) {
188715091d37SBarry Smith       idx   = 6*(*vi++);
1888f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1889f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1890f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1891f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1892f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1893f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1894f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1895f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1896f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
189715091d37SBarry Smith       v += 36;
189815091d37SBarry Smith     }
189915091d37SBarry Smith     idc = 6*(*c--);
190015091d37SBarry Smith     v   = aa + 36*diag[i];
1901f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1902f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1903f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1904f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1905f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1906f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1907f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1908f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1909f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1910f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1911f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1912f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
191315091d37SBarry Smith   }
191415091d37SBarry Smith 
191515091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
191615091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1917d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19181ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1919dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
192015091d37SBarry Smith   PetscFunctionReturn(0);
192115091d37SBarry Smith }
192215091d37SBarry Smith 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
/*
   Older variant of the block-size-6 permuted solve; it is compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.

   Block row i of the sweep reads block rout[i] of bb and stores its result
   into block cout[i] of xx; the intermediate vector lives in a->solve_work.

   Forward sweep:  block row i uses the blocks a->i[i] .. a->i[i+1]-1.
   Backward sweep: block row i is located through a->i[2*n-i]; all but the
                   last block of that row are eliminated and the last block
                   then multiplies the result.

   Entry (r,c) of every 6x6 block is read from v[r+6*c].
*/
PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  IS                iscol = a->col,isrow = a->row;
  PetscErrorCode    ierr;
  const PetscInt    *rout,*cout;
  PetscInt          n = a->mbs,*ai = a->i,*aj = a->j;
  PetscInt          i,nz,row,*vi;
  const MatScalar   *aa = a->a,*v;
  const PetscScalar *b,*bp,*tb;
  PetscScalar       *x,*t,*tr,*xp;
  PetscScalar       s1,s2,s3,s4,s5,s6;
  PetscScalar       x1,x2,x3,x4,x5,x6;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  t    = a->solve_work;

  ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr);

  /* forward solve the lower triangular: block row 0 is a permuted copy */
  bp   = b + 6*rout[0];
  t[0] = bp[0]; t[1] = bp[1];
  t[2] = bp[2]; t[3] = bp[3];
  t[4] = bp[4]; t[5] = bp[5];
  for (i=1; i<n; i++) {
    v  = aa + 36*ai[i];
    vi = aj + ai[i];
    nz = ai[i+1] - ai[i];
    bp = b + 6*rout[i];
    s1 = bp[0]; s2 = bp[1]; s3 = bp[2];
    s4 = bp[3]; s5 = bp[4]; s6 = bp[5];
    while (nz-- > 0) {
      /* already-computed block of t selected by the next column index */
      tb = t + 6*(*vi++);
      x1 = tb[0]; x2 = tb[1]; x3 = tb[2];
      x4 = tb[3]; x5 = tb[4]; x6 = tb[5];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    tr    = t + 6*i;
    tr[0] = s1; tr[1] = s2;
    tr[2] = s3; tr[3] = s4;
    tr[4] = s5; tr[5] = s6;
  }

  /* backward solve the upper triangular */
  for (i=n-1; i>=0; i--) {
    row = 2*n - i;
    v   = aa + 36*ai[row];
    vi  = aj + ai[row];
    nz  = ai[row+1] - ai[row] - 1; /* the last block of the row is handled after the loop */
    tr  = t + 6*i;
    s1  = tr[0]; s2 = tr[1];
    s3  = tr[2]; s4 = tr[3];
    s5  = tr[4]; s6 = tr[5];
    while (nz-- > 0) {
      tb = t + 6*(*vi++);
      x1 = tb[0]; x2 = tb[1];
      x3 = tb[2]; x4 = tb[3];
      x5 = tb[4]; x6 = tb[5];
      s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
      s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
      s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
      s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
      s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
      s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
      v  += 36;
    }
    /* v now addresses the last block of the row; the product is kept in t
       and stored permuted into x */
    xp    = x + 6*cout[i];
    xp[0] = tr[0] = v[0]*s1+v[6]*s2+v[12]*s3+
                    v[18]*s4+v[24]*s5+v[30]*s6;
    xp[1] = tr[1] = v[1]*s1+v[7]*s2+v[13]*s3+
                    v[19]*s4+v[25]*s5+v[31]*s6;
    xp[2] = tr[2] = v[2]*s1+v[8]*s2+v[14]*s3+
                    v[20]*s4+v[26]*s5+v[32]*s6;
    xp[3] = tr[3] = v[3]*s1+v[9]*s2+v[15]*s3+
                    v[21]*s4+v[27]*s5+v[33]*s6;
    xp[4] = tr[4] = v[4]*s1+v[10]*s2+v[16]*s3+
                    v[22]*s4+v[28]*s5+v[34]*s6;
    xp[5] = tr[5] = v[5]*s1+v[11]*s2+v[17]*s3+
                    v[23]*s4+v[29]*s5+v[35]*s6;
  }

  ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
  ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
20198f690400SShri Abhyankar 
20206506fda5SShri Abhyankar #undef __FUNCT__
2021*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_newdatastruct - Forward and backward block-triangular
   solve with 6x6 blocks, applying the row permutation a->row to the
   right-hand side and the column permutation a->col to the solution.

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a 6x6 block is taken from v[r+6*c];
     - the L part of block row i occupies entries ai[i] .. ai[i+1]-1;
     - the off-diagonal U part of block row i occupies entries
       adiag[i+1]+1 .. adiag[i]-1, and the block at adiag[i] is applied last
       as a plain multiply (presumably the already inverted diagonal block --
       confirm against the factorization routine).
*/
2022*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
20236506fda5SShri Abhyankar {
20246506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20256506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
20266506fda5SShri Abhyankar   PetscErrorCode    ierr;
20276506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
20286506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
20296506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
20306506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
20316506fda5SShri Abhyankar   const PetscScalar *b;
20326506fda5SShri Abhyankar   PetscFunctionBegin;
20336506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20346506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* t is indexed by block row i; b is read at block r[i] and x is written at block c[i] */
20356506fda5SShri Abhyankar   t  = a->solve_work;
20366506fda5SShri Abhyankar 
20376506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
20386506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
20396506fda5SShri Abhyankar 
20406506fda5SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 has no L entries to subtract: copy the permuted right-hand side */
20416506fda5SShri Abhyankar   idx    = 6*r[0];
20426506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
20436506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
20446506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
20456506fda5SShri Abhyankar   for (i=1; i<n; i++) {
20466506fda5SShri Abhyankar     v     = aa + 36*ai[i];
20476506fda5SShri Abhyankar     vi    = aj + ai[i];
20486506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
20496506fda5SShri Abhyankar     idx   = 6*r[i];
20506506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
20516506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
    /* s -= L(i,vi[m]) * t(vi[m]) for every stored block of row i */
20526506fda5SShri Abhyankar     for(m=0;m<nz;m++){
20536506fda5SShri Abhyankar       idx   = 6*vi[m];
20546506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
20556506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
20566506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
20576506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
20586506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
20596506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
20606506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
20616506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
20626506fda5SShri Abhyankar       v += 36;
20636506fda5SShri Abhyankar     }
20646506fda5SShri Abhyankar     idx = 6*i;
20656506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
20666506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
20676506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
20686506fda5SShri Abhyankar   }
20696506fda5SShri Abhyankar   /* backward solve the upper triangular */
20706506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
20716506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
20726506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
20736506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
20746506fda5SShri Abhyankar     idt  = 6*i;
20756506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
20766506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
20776506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
20786506fda5SShri Abhyankar     for(m=0;m<nz;m++){
20796506fda5SShri Abhyankar       idx   = 6*vi[m];
20806506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
20816506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
20826506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
20836506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
20846506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
20856506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
20866506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
20876506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
20886506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
20896506fda5SShri Abhyankar       v += 36;
20906506fda5SShri Abhyankar     }
    /* v now points at block adiag[i]; the product is stored both in t (for later rows) and in x at the permuted position */
20916506fda5SShri Abhyankar     idc = 6*c[i];
20926506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
20936506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
20946506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
20956506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
20966506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
20976506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
20986506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
20996506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
21006506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
21016506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
21026506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
21036506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
21046506fda5SShri Abhyankar   }
21056506fda5SShri Abhyankar 
21066506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21076506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21086506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21096506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
21106506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
21116506fda5SShri Abhyankar   PetscFunctionReturn(0);
21126506fda5SShri Abhyankar }
21138f690400SShri Abhyankar 
21148f690400SShri Abhyankar #undef __FUNCT__
21154a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Forward and backward block-triangular
   solve with 6x6 blocks and no row/column permutation; the solve is carried
   out directly in the array of xx (no work vector is used).

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a 6x6 block is taken from v[r+6*c];
     - in block row i, entries ai[i] .. diag[i]-1 are used as L, entry diag[i]
       is the diagonal block and entries diag[i]+1 .. ai[i+1]-1 are used as U;
     - the diagonal block is applied as a plain multiply (presumably it is
       stored already inverted -- confirm against the factorization routine).
*/
2116dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
211715091d37SBarry Smith {
211815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2119690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2120dfbe8321SBarry Smith   PetscErrorCode    ierr;
2121690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2122d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2123d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2124d9fead3dSBarry Smith   const PetscScalar *b;
212515091d37SBarry Smith 
212615091d37SBarry Smith   PetscFunctionBegin;
2127d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
212915091d37SBarry Smith   /* forward solve the lower triangular */
  /* block row 0 has no L entries to subtract: copy the right-hand side */
213015091d37SBarry Smith   idx    = 0;
213115091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
213215091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
213315091d37SBarry Smith   for (i=1; i<n; i++) {
213415091d37SBarry Smith     v     =  aa + 36*ai[i];
213515091d37SBarry Smith     vi    =  aj + ai[i];
213615091d37SBarry Smith     nz    =  diag[i] - ai[i];
213715091d37SBarry Smith     idx   =  6*i;
2138f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2139f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
    /* s -= L(i,j) * x(j) for every block stored before the diagonal of row i */
214015091d37SBarry Smith     while (nz--) {
214115091d37SBarry Smith       jdx   = 6*(*vi++);
214215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
214315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2144f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2145f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2146f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2147f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2148f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2149f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
215015091d37SBarry Smith       v += 36;
215115091d37SBarry Smith      }
2152f1af5d2fSBarry Smith     x[idx]   = s1;
2153f1af5d2fSBarry Smith     x[1+idx] = s2;
2154f1af5d2fSBarry Smith     x[2+idx] = s3;
2155f1af5d2fSBarry Smith     x[3+idx] = s4;
2156f1af5d2fSBarry Smith     x[4+idx] = s5;
2157f1af5d2fSBarry Smith     x[5+idx] = s6;
215815091d37SBarry Smith   }
215915091d37SBarry Smith   /* backward solve the upper triangular */
216015091d37SBarry Smith   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal: only the U off-diagonal blocks are subtracted */
216115091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
216215091d37SBarry Smith     vi   = aj + diag[i] + 1;
216315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
216415091d37SBarry Smith     idt  = 6*i;
2165f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2166f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2167f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
216815091d37SBarry Smith     while (nz--) {
216915091d37SBarry Smith       idx   = 6*(*vi++);
217015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
217115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2172f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2173f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2174f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2175f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2176f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2177f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
217815091d37SBarry Smith       v += 36;
217915091d37SBarry Smith     }
    /* rewind to the diagonal block of row i and multiply it into the accumulated values */
218015091d37SBarry Smith     v        = aa + 36*diag[i];
2181f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2182f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2183f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2184f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2185f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2186f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
218715091d37SBarry Smith   }
218815091d37SBarry Smith 
2189d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2191dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
219215091d37SBarry Smith   PetscFunctionReturn(0);
219315091d37SBarry Smith }
219415091d37SBarry Smith 
2195*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED)
21964a2ae208SSatish Balay #undef __FUNCT__
2197cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct (old version) - Only
   compiled when OLD_ROUTINE_TO_BE_REPLACED is defined (see the enclosing #if).

   Forward and backward block-triangular solve without permutations, carried
   out directly in the array of xx.  Strides come from bs and bs2, but six
   components per block are hard-coded, so this presumably requires bs == 6
   and bs2 == 36 -- confirm at the call site.

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a block is taken from v[r+6*c];
     - the L part of block row i occupies entries ai[i] .. ai[i+1]-1;
     - the U part of block row i starts at ai[2*n-i]; all but the last entry of
       that row are subtracted, and the last entry is applied as the
       (inverse) diagonal block.
*/
2198cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2199cee9d6f2SShri Abhyankar {
2200cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
22016464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2202cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2203cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2204cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2205cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2206cee9d6f2SShri Abhyankar     PetscScalar       *x;
2207cee9d6f2SShri Abhyankar     const PetscScalar *b;
2208cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2209cee9d6f2SShri Abhyankar 
2210cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2211cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2212cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2213cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 has no L entries to subtract: copy the right-hand side */
2214cee9d6f2SShri Abhyankar     idx    = 0;
2215cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2216cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
2217cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2218cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2219cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2220cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
2221cee9d6f2SShri Abhyankar       idx   = bs*i;
2222cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2223cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
       /* s -= L(i,vi[k]) * x(vi[k]) for every stored block of row i */
22246464896eSShri Abhyankar        for(k=0;k<nz;k++){
22256464896eSShri Abhyankar           jdx   = bs*vi[k];
2226cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2227cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2228cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2229cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2230cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2231cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2232cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2233cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2234cee9d6f2SShri Abhyankar           v   +=  bs2;
2235cee9d6f2SShri Abhyankar         }
2236cee9d6f2SShri Abhyankar 
2237cee9d6f2SShri Abhyankar        x[idx]   = s1;
2238cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2239cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2240cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2241cee9d6f2SShri Abhyankar        x[4+idx] = s5;
2242cee9d6f2SShri Abhyankar        x[5+idx] = s6;
2243cee9d6f2SShri Abhyankar     }
2244cee9d6f2SShri Abhyankar 
2245cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
2246cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
     /* the U part of row i is addressed through ai[2*n-i]; nz excludes the last entry of that row */
2247cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2248cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2249cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2250cee9d6f2SShri Abhyankar      idt = bs*i;
2251cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2252cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
22536464896eSShri Abhyankar      for(k=0;k<nz;k++){
22546464896eSShri Abhyankar       idx   = bs*vi[k];
2255cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2256cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
2257cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2258cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2259cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2260cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2261cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2262cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2263cee9d6f2SShri Abhyankar         v   +=  bs2;
2264cee9d6f2SShri Abhyankar     }
2265cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
    /* v was advanced past the off-diagonal blocks and now points at the last block of the row */
2266cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2267cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2268cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2269cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2270cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2271cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2272cee9d6f2SShri Abhyankar   }
2273cee9d6f2SShri Abhyankar 
2274cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2275cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2276cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2277cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2278cee9d6f2SShri Abhyankar }
2279*a2d6a19aSShri Abhyankar #endif
22808f690400SShri Abhyankar 
2281cee9d6f2SShri Abhyankar #undef __FUNCT__
2282*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - Forward and backward
   block-triangular solve without permutations, carried out directly in the
   array of xx.  Strides come from bs and bs2, but six components per block
   are hard-coded, so this presumably requires bs == 6 and bs2 == 36 --
   confirm at the call site.

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a block is taken from v[r+6*c];
     - the L part of block row i occupies entries ai[i] .. ai[i+1]-1;
     - the off-diagonal U part of block row i occupies entries
       adiag[i+1]+1 .. adiag[i]-1, and the block at adiag[i] is applied last
       as the (inverse) diagonal block.
*/
2283*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
228453cca76cSShri Abhyankar {
228553cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
228653cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
228753cca76cSShri Abhyankar     PetscErrorCode    ierr;
228853cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
228953cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
229053cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
229153cca76cSShri Abhyankar     PetscScalar       *x;
229253cca76cSShri Abhyankar     const PetscScalar *b;
229353cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
229453cca76cSShri Abhyankar 
229553cca76cSShri Abhyankar     PetscFunctionBegin;
229653cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
229753cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
229853cca76cSShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 has no L entries to subtract: copy the right-hand side */
229953cca76cSShri Abhyankar     idx    = 0;
230053cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
230153cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
230253cca76cSShri Abhyankar     for (i=1; i<n; i++) {
230353cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
230453cca76cSShri Abhyankar        vi   = aj + ai[i];
230553cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
230653cca76cSShri Abhyankar       idx   = bs*i;
230753cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
230853cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
       /* s -= L(i,vi[k]) * x(vi[k]) for every stored block of row i */
230953cca76cSShri Abhyankar        for(k=0;k<nz;k++){
231053cca76cSShri Abhyankar           jdx   = bs*vi[k];
231153cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
231253cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
231353cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
231453cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
231553cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
231653cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
231753cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
231853cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
231953cca76cSShri Abhyankar           v   +=  bs2;
232053cca76cSShri Abhyankar         }
232153cca76cSShri Abhyankar 
232253cca76cSShri Abhyankar        x[idx]   = s1;
232353cca76cSShri Abhyankar        x[1+idx] = s2;
232453cca76cSShri Abhyankar        x[2+idx] = s3;
232553cca76cSShri Abhyankar        x[3+idx] = s4;
232653cca76cSShri Abhyankar        x[4+idx] = s5;
232753cca76cSShri Abhyankar        x[5+idx] = s6;
232853cca76cSShri Abhyankar     }
232953cca76cSShri Abhyankar 
233053cca76cSShri Abhyankar    /* backward solve the upper triangular */
233153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
    /* off-diagonal U blocks of row i lie strictly between adiag[i+1] and adiag[i] */
233253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
233353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
233453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
233553cca76cSShri Abhyankar      idt = bs*i;
233653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
233753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
233853cca76cSShri Abhyankar      for(k=0;k<nz;k++){
233953cca76cSShri Abhyankar       idx   = bs*vi[k];
234053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
234153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
234253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
234353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
234453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
234553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
234653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
234753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
234853cca76cSShri Abhyankar         v   +=  bs2;
234953cca76cSShri Abhyankar     }
235053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
    /* v was advanced past the off-diagonal blocks and now points at block adiag[i] */
235153cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
235253cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
235353cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
235453cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
235553cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
235653cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
235753cca76cSShri Abhyankar   }
235853cca76cSShri Abhyankar 
235953cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
236053cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236153cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
236253cca76cSShri Abhyankar   PetscFunctionReturn(0);
236353cca76cSShri Abhyankar }
236453cca76cSShri Abhyankar 
236553cca76cSShri Abhyankar #undef __FUNCT__
23664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
/*
   MatSolve_SeqBAIJ_5 - Forward and backward block-triangular solve with 5x5
   blocks, applying the row permutation a->row to the right-hand side and the
   column permutation a->col to the solution.

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a 5x5 block is taken from v[r+5*c];
     - in block row i, entries ai[i] .. diag[i]-1 are used as L, entry diag[i]
       is the diagonal block and entries diag[i]+1 .. ai[i+1]-1 are used as U;
     - the diagonal block is applied as a plain multiply (presumably it is
       stored already inverted -- confirm against the factorization routine).
*/
2367dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
23684e2b4712SSatish Balay {
23694e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
23704e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
23716849ba73SBarry Smith   PetscErrorCode    ierr;
23725d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
23735d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2374d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2375d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2376d9fead3dSBarry Smith   const PetscScalar *b;
23774e2b4712SSatish Balay 
23784e2b4712SSatish Balay   PetscFunctionBegin;
2379d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* t is indexed by block row i; b is read through r and x is written through c */
2381f1af5d2fSBarry Smith   t  = a->solve_work;
23824e2b4712SSatish Balay 
  /* r is consumed front to back with *r++; c starts at the last entry and is consumed back to front with *c-- */
23834e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23844e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
23854e2b4712SSatish Balay 
23864e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0 has no L entries to subtract: copy the permuted right-hand side */
23874e2b4712SSatish Balay   idx    = 5*(*r++);
2388f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2389f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
23904e2b4712SSatish Balay   for (i=1; i<n; i++) {
23914e2b4712SSatish Balay     v     = aa + 25*ai[i];
23924e2b4712SSatish Balay     vi    = aj + ai[i];
23934e2b4712SSatish Balay     nz    = diag[i] - ai[i];
23944e2b4712SSatish Balay     idx   = 5*(*r++);
2395f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2396f1af5d2fSBarry Smith     s5  = b[4+idx];
    /* s -= L(i,j) * t(j) for every block stored before the diagonal of row i */
23974e2b4712SSatish Balay     while (nz--) {
23984e2b4712SSatish Balay       idx   = 5*(*vi++);
2399f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2400f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2401f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2402f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2403f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2404f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2405f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
24064e2b4712SSatish Balay       v += 25;
24074e2b4712SSatish Balay     }
24084e2b4712SSatish Balay     idx = 5*i;
2409f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2410f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
24114e2b4712SSatish Balay   }
24124e2b4712SSatish Balay   /* backward solve the upper triangular */
24134e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal: only the U off-diagonal blocks are subtracted */
24144e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
24154e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24164e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
24174e2b4712SSatish Balay     idt  = 5*i;
2418f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2419f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
24204e2b4712SSatish Balay     while (nz--) {
24214e2b4712SSatish Balay       idx   = 5*(*vi++);
2422f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2423f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2424f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2425f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2426f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2427f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2428f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
24294e2b4712SSatish Balay       v += 25;
24304e2b4712SSatish Balay     }
    /* rewind to the diagonal block; the product is stored both in t (for later rows) and in x at the permuted position */
24314e2b4712SSatish Balay     idc = 5*(*c--);
24324e2b4712SSatish Balay     v   = aa + 25*diag[i];
2433f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2434f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2435f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2436f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2437f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2438f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2439f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2440f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2441f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2442f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
24434e2b4712SSatish Balay   }
24444e2b4712SSatish Balay 
24454e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
24464e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2447d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2449dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
24504e2b4712SSatish Balay   PetscFunctionReturn(0);
24514e2b4712SSatish Balay }
24524e2b4712SSatish Balay 
2453*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED)
24544a2ae208SSatish Balay #undef __FUNCT__
24558f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_newdatastruct (old version) - Only compiled when
   OLD_ROUTINE_TO_BE_REPLACED is defined (see the enclosing #if).

   Forward and backward block-triangular solve with 5x5 blocks, applying the
   row permutation a->row to the right-hand side and the column permutation
   a->col to the solution.

   Input Parameters:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) is read as the factors
     bb - right-hand side (only read)

   Output Parameter:
     xx - solution

   Errors from the PETSc calls are propagated through CHKERRQ.

   Layout as read by this routine:
     - entry (r,c) of a 5x5 block is taken from v[r+5*c];
     - the L part of block row i occupies entries ai[i] .. ai[i+1]-1;
     - the U part of block row i starts at ai[2*n-i]; all but the last entry of
       that row are subtracted, and the last entry is applied as a plain
       multiply (presumably the already inverted diagonal block -- confirm
       against the factorization routine).
*/
24568f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
24578f690400SShri Abhyankar {
24588f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
24598f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
24608f690400SShri Abhyankar   PetscErrorCode    ierr;
24618f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
246229b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
24638f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
24648f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
24658f690400SShri Abhyankar   const PetscScalar *b;
24668f690400SShri Abhyankar 
24678f690400SShri Abhyankar   PetscFunctionBegin;
24688f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24698f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
  /* t is indexed by block row i; b is read at block r[i] and x is written at block c[i] */
24708f690400SShri Abhyankar   t  = a->solve_work;
24718f690400SShri Abhyankar 
24728f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
247329b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
24748f690400SShri Abhyankar 
24758f690400SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 has no L entries to subtract: copy the permuted right-hand side */
247629b92fc1SShri Abhyankar   idx    = 5*r[0];
24778f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
24788f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
24798f690400SShri Abhyankar   for (i=1; i<n; i++) {
24808f690400SShri Abhyankar     v     = aa + 25*ai[i];
24818f690400SShri Abhyankar     vi    = aj + ai[i];
24828f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
248329b92fc1SShri Abhyankar     idx   = 5*r[i];
24848f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
24858f690400SShri Abhyankar     s5  = b[4+idx];
    /* s -= L(i,vi[m]) * t(vi[m]) for every stored block of row i */
248629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
248729b92fc1SShri Abhyankar       idx   = 5*vi[m];
24888f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
24898f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
24908f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
24918f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
24928f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
24938f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
24948f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
24958f690400SShri Abhyankar       v += 25;
24968f690400SShri Abhyankar     }
24978f690400SShri Abhyankar     idx = 5*i;
24988f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
24998f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
25008f690400SShri Abhyankar   }
25018f690400SShri Abhyankar   /* backward solve the upper triangular */
25028f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the U part of row i is addressed through ai[k] with k = 2*n-i; nz excludes the last entry of that row */
25038f690400SShri Abhyankar     k    = 2*n-i;
25048f690400SShri Abhyankar     v    = aa + 25*ai[k];
25058f690400SShri Abhyankar     vi   = aj + ai[k];
25068f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
25078f690400SShri Abhyankar     idt  = 5*i;
25088f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
25098f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
251029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
251129b92fc1SShri Abhyankar       idx   = 5*vi[m];
25128f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
25138f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
25148f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
25158f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
25168f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
25178f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
25188f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
25198f690400SShri Abhyankar       v += 25;
25208f690400SShri Abhyankar     }
    /* v now points at the last block of the row; the product is stored both in t (for later rows) and in x at the permuted position */
252129b92fc1SShri Abhyankar     idc = 5*c[i];
25228f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
25238f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
25248f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
25258f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
25268f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
25278f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
25288f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
25298f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
25308f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
25318f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
25328f690400SShri Abhyankar   }
25338f690400SShri Abhyankar 
25348f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25358f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
25368f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25378f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
25388f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
25398f690400SShri Abhyankar   PetscFunctionReturn(0);
25408f690400SShri Abhyankar }
2541*a2d6a19aSShri Abhyankar #endif
254278bb4007SShri Abhyankar 
254378bb4007SShri Abhyankar #undef __FUNCT__
2544*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2545*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
254678bb4007SShri Abhyankar {
254778bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
254878bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
254978bb4007SShri Abhyankar   PetscErrorCode    ierr;
255078bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
255178bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
255278bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
255378bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
255478bb4007SShri Abhyankar   const PetscScalar *b;
255578bb4007SShri Abhyankar 
255678bb4007SShri Abhyankar   PetscFunctionBegin;
255778bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
255878bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255978bb4007SShri Abhyankar   t  = a->solve_work;
256078bb4007SShri Abhyankar 
256178bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
256278bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
256378bb4007SShri Abhyankar 
256478bb4007SShri Abhyankar   /* forward solve the lower triangular */
256578bb4007SShri Abhyankar   idx    = 5*r[0];
256678bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
256778bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
256878bb4007SShri Abhyankar   for (i=1; i<n; i++) {
256978bb4007SShri Abhyankar     v     = aa + 25*ai[i];
257078bb4007SShri Abhyankar     vi    = aj + ai[i];
257178bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
257278bb4007SShri Abhyankar     idx   = 5*r[i];
257378bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
257478bb4007SShri Abhyankar     s5  = b[4+idx];
257578bb4007SShri Abhyankar     for(m=0;m<nz;m++){
257678bb4007SShri Abhyankar       idx   = 5*vi[m];
257778bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
257878bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
257978bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
258078bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
258178bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
258278bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
258378bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
258478bb4007SShri Abhyankar       v += 25;
258578bb4007SShri Abhyankar     }
258678bb4007SShri Abhyankar     idx = 5*i;
258778bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
258878bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
258978bb4007SShri Abhyankar   }
259078bb4007SShri Abhyankar   /* backward solve the upper triangular */
259178bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
259278bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
259378bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
259478bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
259578bb4007SShri Abhyankar     idt  = 5*i;
259678bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
259778bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
259878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
259978bb4007SShri Abhyankar       idx   = 5*vi[m];
260078bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
260178bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
260278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
260378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
260478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
260578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
260678bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
260778bb4007SShri Abhyankar       v += 25;
260878bb4007SShri Abhyankar     }
260978bb4007SShri Abhyankar     idc = 5*c[i];
261078bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
261178bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
261278bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
261378bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
261478bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
261578bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
261678bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
261778bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
261878bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
261978bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
262078bb4007SShri Abhyankar   }
262178bb4007SShri Abhyankar 
262278bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
262378bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
262478bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
262578bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
262678bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
262778bb4007SShri Abhyankar   PetscFunctionReturn(0);
262878bb4007SShri Abhyankar }
262978bb4007SShri Abhyankar 
26308f690400SShri Abhyankar #undef __FUNCT__
26314a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2632dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
263315091d37SBarry Smith {
263415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2635690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2636dfbe8321SBarry Smith   PetscErrorCode    ierr;
2637690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2638d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2639d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2640d9fead3dSBarry Smith   const PetscScalar *b;
264115091d37SBarry Smith 
264215091d37SBarry Smith   PetscFunctionBegin;
2643d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26441ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
264515091d37SBarry Smith   /* forward solve the lower triangular */
264615091d37SBarry Smith   idx    = 0;
264715091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
264815091d37SBarry Smith   for (i=1; i<n; i++) {
264915091d37SBarry Smith     v     =  aa + 25*ai[i];
265015091d37SBarry Smith     vi    =  aj + ai[i];
265115091d37SBarry Smith     nz    =  diag[i] - ai[i];
265215091d37SBarry Smith     idx   =  5*i;
2653f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
265415091d37SBarry Smith     while (nz--) {
265515091d37SBarry Smith       jdx   = 5*(*vi++);
265615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2657f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2658f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2659f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2660f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2661f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
266215091d37SBarry Smith       v    += 25;
266315091d37SBarry Smith     }
2664f1af5d2fSBarry Smith     x[idx]   = s1;
2665f1af5d2fSBarry Smith     x[1+idx] = s2;
2666f1af5d2fSBarry Smith     x[2+idx] = s3;
2667f1af5d2fSBarry Smith     x[3+idx] = s4;
2668f1af5d2fSBarry Smith     x[4+idx] = s5;
266915091d37SBarry Smith   }
267015091d37SBarry Smith   /* backward solve the upper triangular */
267115091d37SBarry Smith   for (i=n-1; i>=0; i--){
267215091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
267315091d37SBarry Smith     vi   = aj + diag[i] + 1;
267415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
267515091d37SBarry Smith     idt  = 5*i;
2676f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2677f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
267815091d37SBarry Smith     while (nz--) {
267915091d37SBarry Smith       idx   = 5*(*vi++);
268015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2681f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2682f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2683f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2684f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2685f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
268615091d37SBarry Smith       v    += 25;
268715091d37SBarry Smith     }
268815091d37SBarry Smith     v        = aa + 25*diag[i];
2689f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2690f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2691f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2692f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2693f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
269415091d37SBarry Smith   }
269515091d37SBarry Smith 
2696d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
269915091d37SBarry Smith   PetscFunctionReturn(0);
270015091d37SBarry Smith }
270115091d37SBarry Smith 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct - older variant, compiled
   only when OLD_ROUTINE_TO_BE_REPLACED is defined.  Performs the forward and
   backward block-triangular sweeps for block size 5 without permutations,
   working directly in the array of xx.

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (i, j, a) is used
-  bb - right-hand side vector

   Output Parameter:
.  xx - result vector

   Notes:
   A block occupies 25 consecutive MatScalar entries; the entry used for
   block row r and block column c sits at offset r + 5*c.
   Lower-part blocks of row i are located through a->i[i] .. a->i[i+1]-1.
   Upper-part blocks of row i are located through entry 2*n-i of a->i; the
   last block of that range multiplies the accumulated row values.
*/
PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  PetscInt          mbs = fact->mbs,*rowstart = fact->i,*colidx = fact->j;
  PetscInt          *cols,row,urow,cnt,p;
  PetscErrorCode    ierr;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*dst;
  PetscScalar       r1,r2,r3,r4,r5;
  const PetscScalar *rhs,*src;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);

  /* forward solve the lower triangular: block row 0 is copied unchanged */
  sol[0] = rhs[0];
  sol[1] = rhs[1];
  sol[2] = rhs[2];
  sol[3] = rhs[3];
  sol[4] = rhs[4];
  for (row=1; row<mbs; row++) {
    blk  = vals + 25*rowstart[row];
    cols = colidx + rowstart[row];
    cnt  = rowstart[row+1] - rowstart[row];
    src  = rhs + 5*row;
    r1   = src[0];
    r2   = src[1];
    r3   = src[2];
    r4   = src[3];
    r5   = src[4];
    for (p=0; p<cnt; p++) {
      src = sol + 5*cols[p];
      r1 -= blk[0]*src[0] + blk[5]*src[1] + blk[10]*src[2]  + blk[15]*src[3] + blk[20]*src[4];
      r2 -= blk[1]*src[0] + blk[6]*src[1] + blk[11]*src[2]  + blk[16]*src[3] + blk[21]*src[4];
      r3 -= blk[2]*src[0] + blk[7]*src[1] + blk[12]*src[2]  + blk[17]*src[3] + blk[22]*src[4];
      r4 -= blk[3]*src[0] + blk[8]*src[1] + blk[13]*src[2]  + blk[18]*src[3] + blk[23]*src[4];
      r5 -= blk[4]*src[0] + blk[9]*src[1] + blk[14]*src[2]  + blk[19]*src[3] + blk[24]*src[4];
      blk += 25;
    }
    dst    = sol + 5*row;
    dst[0] = r1;
    dst[1] = r2;
    dst[2] = r3;
    dst[3] = r4;
    dst[4] = r5;
  }

  /* backward solve the upper triangular */
  for (row=mbs-1; row>=0; row--) {
    urow = 2*mbs - row;              /* position in a->i describing the upper part of this row */
    blk  = vals + 25*rowstart[urow];
    cols = colidx + rowstart[urow];
    cnt  = rowstart[urow+1] - rowstart[urow] - 1;
    dst  = sol + 5*row;
    r1   = dst[0];
    r2   = dst[1];
    r3   = dst[2];
    r4   = dst[3];
    r5   = dst[4];
    for (p=0; p<cnt; p++) {
      src = sol + 5*cols[p];
      r1 -= blk[0]*src[0] + blk[5]*src[1] + blk[10]*src[2]  + blk[15]*src[3] + blk[20]*src[4];
      r2 -= blk[1]*src[0] + blk[6]*src[1] + blk[11]*src[2]  + blk[16]*src[3] + blk[21]*src[4];
      r3 -= blk[2]*src[0] + blk[7]*src[1] + blk[12]*src[2]  + blk[17]*src[3] + blk[22]*src[4];
      r4 -= blk[3]*src[0] + blk[8]*src[1] + blk[13]*src[2]  + blk[18]*src[3] + blk[23]*src[4];
      r5 -= blk[4]*src[0] + blk[9]*src[1] + blk[14]*src[2]  + blk[19]*src[3] + blk[24]*src[4];
      blk += 25;
    }
    /* x = inv_diagonal*x; blk now addresses the last block of the range */
    dst[0] = blk[0]*r1 + blk[5]*r2 + blk[10]*r3  + blk[15]*r4 + blk[20]*r5;
    dst[1] = blk[1]*r1 + blk[6]*r2 + blk[11]*r3  + blk[16]*r4 + blk[21]*r5;
    dst[2] = blk[2]*r1 + blk[7]*r2 + blk[12]*r3  + blk[17]*r4 + blk[22]*r5;
    dst[3] = blk[3]*r1 + blk[8]*r2 + blk[13]*r3  + blk[18]*r4 + blk[23]*r5;
    dst[4] = blk[4]*r1 + blk[9]*r2 + blk[14]*r3  + blk[19]*r4 + blk[24]*r5;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*25*(fact->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
2776cee9d6f2SShri Abhyankar 
2777cee9d6f2SShri Abhyankar #undef __FUNCT__
2778*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2779*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
278053cca76cSShri Abhyankar {
278153cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
278253cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
278353cca76cSShri Abhyankar   PetscErrorCode    ierr;
278453cca76cSShri Abhyankar   PetscInt          jdx;
278553cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
278653cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
278753cca76cSShri Abhyankar   const PetscScalar *b;
278853cca76cSShri Abhyankar 
278953cca76cSShri Abhyankar   PetscFunctionBegin;
279053cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
279153cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
279253cca76cSShri Abhyankar   /* forward solve the lower triangular */
279353cca76cSShri Abhyankar   idx    = 0;
279453cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
279553cca76cSShri Abhyankar   for (i=1; i<n; i++) {
279653cca76cSShri Abhyankar     v   = aa + 25*ai[i];
279753cca76cSShri Abhyankar     vi  = aj + ai[i];
279853cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
279953cca76cSShri Abhyankar     idx = 5*i;
280053cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
280153cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
280253cca76cSShri Abhyankar       jdx   = 5*vi[k];
280353cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
280453cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
280553cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
280653cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
280753cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
280853cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
280953cca76cSShri Abhyankar       v    += 25;
281053cca76cSShri Abhyankar     }
281153cca76cSShri Abhyankar     x[idx]   = s1;
281253cca76cSShri Abhyankar     x[1+idx] = s2;
281353cca76cSShri Abhyankar     x[2+idx] = s3;
281453cca76cSShri Abhyankar     x[3+idx] = s4;
281553cca76cSShri Abhyankar     x[4+idx] = s5;
281653cca76cSShri Abhyankar   }
281753cca76cSShri Abhyankar 
281853cca76cSShri Abhyankar   /* backward solve the upper triangular */
281953cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
282053cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
282153cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
282253cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
282353cca76cSShri Abhyankar     idt = 5*i;
282453cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
282553cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
282653cca76cSShri Abhyankar     for(k=0;k<nz;k++){
282753cca76cSShri Abhyankar       idx   = 5*vi[k];
282853cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
282953cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
283053cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
283153cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
283253cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
283353cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
283453cca76cSShri Abhyankar       v    += 25;
283553cca76cSShri Abhyankar     }
283653cca76cSShri Abhyankar     /* x = inv_diagonal*x */
283753cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
283853cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
283953cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
284053cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
284153cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
284253cca76cSShri Abhyankar   }
284353cca76cSShri Abhyankar 
284453cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
284553cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
284653cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
284753cca76cSShri Abhyankar   PetscFunctionReturn(0);
284853cca76cSShri Abhyankar }
284953cca76cSShri Abhyankar 
285053cca76cSShri Abhyankar #undef __FUNCT__
28514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2852dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
28534e2b4712SSatish Balay {
28544e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
28554e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
28566849ba73SBarry Smith   PetscErrorCode    ierr;
28575d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
28585d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2859d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2860d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2861d9fead3dSBarry Smith   const PetscScalar *b;
28624e2b4712SSatish Balay 
28634e2b4712SSatish Balay   PetscFunctionBegin;
2864d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28651ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2866f1af5d2fSBarry Smith   t  = a->solve_work;
28674e2b4712SSatish Balay 
28684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28704e2b4712SSatish Balay 
28714e2b4712SSatish Balay   /* forward solve the lower triangular */
28724e2b4712SSatish Balay   idx    = 4*(*r++);
2873f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2874f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
28754e2b4712SSatish Balay   for (i=1; i<n; i++) {
28764e2b4712SSatish Balay     v     = aa + 16*ai[i];
28774e2b4712SSatish Balay     vi    = aj + ai[i];
28784e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28794e2b4712SSatish Balay     idx   = 4*(*r++);
2880f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
28814e2b4712SSatish Balay     while (nz--) {
28824e2b4712SSatish Balay       idx   = 4*(*vi++);
2883f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2884f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2885f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2886f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2887f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
28884e2b4712SSatish Balay       v    += 16;
28894e2b4712SSatish Balay     }
28904e2b4712SSatish Balay     idx        = 4*i;
2891f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2892f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
28934e2b4712SSatish Balay   }
28944e2b4712SSatish Balay   /* backward solve the upper triangular */
28954e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28964e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
28974e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28984e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28994e2b4712SSatish Balay     idt  = 4*i;
2900f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2901f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
29024e2b4712SSatish Balay     while (nz--) {
29034e2b4712SSatish Balay       idx   = 4*(*vi++);
2904f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2905f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2906f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2907f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2908f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2909f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
29104e2b4712SSatish Balay       v += 16;
29114e2b4712SSatish Balay     }
29124e2b4712SSatish Balay     idc      = 4*(*c--);
29134e2b4712SSatish Balay     v        = aa + 16*diag[i];
2914f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2915f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2916f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2917f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
29184e2b4712SSatish Balay   }
29194e2b4712SSatish Balay 
29204e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29214e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2922d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29231ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2924dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
29254e2b4712SSatish Balay   PetscFunctionReturn(0);
29264e2b4712SSatish Balay }
2927f26ec98cSKris Buschelman 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
/*
   MatSolve_SeqBAIJ_4_newdatastruct - older variant, compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.  Performs the forward and backward
   block-triangular sweeps for block size 4 using the row and column index
   sets stored in the matrix (a->row, a->col).

   Input Parameters:
+  A  - matrix whose Mat_SeqBAIJ data (i, j, a, solve_work) is used
-  bb - right-hand side vector

   Output Parameter:
.  xx - result vector

   Notes:
   A block occupies 16 consecutive MatScalar entries; the entry used for
   block row r and block column c sits at offset r + 4*c.
   Lower-part blocks of row i are located through a->i[i] .. a->i[i+1]-1.
   Upper-part blocks of row i are located through entry 2*n-i of a->i; the
   last block of that range multiplies the accumulated row values.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ *)A->data;
  IS                colis = fact->col,rowis = fact->row;
  PetscErrorCode    ierr;
  PetscInt          mbs = fact->mbs,*rowstart = fact->i,*colidx = fact->j;
  PetscInt          *cols,row,urow,cnt,p;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work,*dst,*out;
  PetscScalar       r1,r2,r3,r4;
  const PetscScalar *rhs,*src;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colis,&cperm);CHKERRQ(ierr);

  /* forward solve the lower triangular: block row 0 is a plain permuted copy */
  src     = rhs + 4*rperm[0];
  work[0] = src[0];
  work[1] = src[1];
  work[2] = src[2];
  work[3] = src[3];
  for (row=1; row<mbs; row++) {
    blk  = vals + 16*rowstart[row];
    cols = colidx + rowstart[row];
    cnt  = rowstart[row+1] - rowstart[row];
    src  = rhs + 4*rperm[row];
    r1   = src[0];
    r2   = src[1];
    r3   = src[2];
    r4   = src[3];
    for (p=0; p<cnt; p++) {
      src = work + 4*cols[p];
      r1 -= blk[0]*src[0] + blk[4]*src[1] + blk[8]*src[2]  + blk[12]*src[3];
      r2 -= blk[1]*src[0] + blk[5]*src[1] + blk[9]*src[2]  + blk[13]*src[3];
      r3 -= blk[2]*src[0] + blk[6]*src[1] + blk[10]*src[2] + blk[14]*src[3];
      r4 -= blk[3]*src[0] + blk[7]*src[1] + blk[11]*src[2] + blk[15]*src[3];
      blk += 16;
    }
    dst    = work + 4*row;
    dst[0] = r1;
    dst[1] = r2;
    dst[2] = r3;
    dst[3] = r4;
  }

  /* backward solve the upper triangular */
  for (row=mbs-1; row>=0; row--) {
    urow = 2*mbs - row;              /* position in a->i describing the upper part of this row */
    blk  = vals + 16*rowstart[urow];
    cols = colidx + rowstart[urow];
    cnt  = rowstart[urow+1] - rowstart[urow] - 1;
    dst  = work + 4*row;
    r1   = dst[0];
    r2   = dst[1];
    r3   = dst[2];
    r4   = dst[3];
    for (p=0; p<cnt; p++) {
      src = work + 4*cols[p];
      r1 -= blk[0]*src[0] + blk[4]*src[1] + blk[8]*src[2]   + blk[12]*src[3];
      r2 -= blk[1]*src[0] + blk[5]*src[1] + blk[9]*src[2]   + blk[13]*src[3];
      r3 -= blk[2]*src[0] + blk[6]*src[1] + blk[10]*src[2]  + blk[14]*src[3];
      r4 -= blk[3]*src[0] + blk[7]*src[1] + blk[11]*src[2]  + blk[15]*src[3];
      blk += 16;
    }
    /* blk now addresses the last block of the range; its product with
       (r1..r4) goes to the work array and to the column-permuted output slot */
    out    = sol + 4*cperm[row];
    out[0] = dst[0] = blk[0]*r1+blk[4]*r2+blk[8]*r3+blk[12]*r4;
    out[1] = dst[1] = blk[1]*r1+blk[5]*r2+blk[9]*r3+blk[13]*r4;
    out[2] = dst[2] = blk[2]*r1+blk[6]*r2+blk[10]*r3+blk[14]*r4;
    out[3] = dst[3] = blk[3]*r1+blk[7]*r2+blk[11]*r3+blk[15]*r4;
  }

  ierr = ISRestoreIndices(rowis,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colis,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*16*(fact->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
30078f690400SShri Abhyankar 
30088f690400SShri Abhyankar #undef __FUNCT__
3009*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
3010*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
301178bb4007SShri Abhyankar {
301278bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
301378bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
301478bb4007SShri Abhyankar   PetscErrorCode    ierr;
301578bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
301678bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
301778bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
301878bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
301978bb4007SShri Abhyankar   const PetscScalar *b;
302078bb4007SShri Abhyankar 
302178bb4007SShri Abhyankar   PetscFunctionBegin;
302278bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
302378bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
302478bb4007SShri Abhyankar   t  = a->solve_work;
302578bb4007SShri Abhyankar 
302678bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
302778bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
302878bb4007SShri Abhyankar 
302978bb4007SShri Abhyankar   /* forward solve the lower triangular */
303078bb4007SShri Abhyankar   idx    = 4*r[0];
303178bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
303278bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
303378bb4007SShri Abhyankar   for (i=1; i<n; i++) {
303478bb4007SShri Abhyankar     v     = aa + 16*ai[i];
303578bb4007SShri Abhyankar     vi    = aj + ai[i];
303678bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
303778bb4007SShri Abhyankar     idx   = 4*r[i];
303878bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
303978bb4007SShri Abhyankar     for(m=0;m<nz;m++){
304078bb4007SShri Abhyankar       idx   = 4*vi[m];
304178bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
304278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
304378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
304478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
304578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
304678bb4007SShri Abhyankar       v    += 16;
304778bb4007SShri Abhyankar     }
304878bb4007SShri Abhyankar     idx        = 4*i;
304978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
305078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
305178bb4007SShri Abhyankar   }
305278bb4007SShri Abhyankar   /* backward solve the upper triangular */
305378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
305478bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
305578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
305678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
305778bb4007SShri Abhyankar     idt  = 4*i;
305878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
305978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
306078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
306178bb4007SShri Abhyankar       idx   = 4*vi[m];
306278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
306378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
306478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
306578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
306678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
306778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
306878bb4007SShri Abhyankar       v += 16;
306978bb4007SShri Abhyankar     }
307078bb4007SShri Abhyankar     idc      = 4*c[i];
307178bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
307278bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
307378bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
307478bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
307578bb4007SShri Abhyankar   }
307678bb4007SShri Abhyankar 
307778bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
307878bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
307978bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
308078bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308178bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
308278bb4007SShri Abhyankar   PetscFunctionReturn(0);
308378bb4007SShri Abhyankar }
308478bb4007SShri Abhyankar 
308578bb4007SShri Abhyankar #undef __FUNCT__
3086f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3087dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3088f26ec98cSKris Buschelman {
3089f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3090f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
30916849ba73SBarry Smith   PetscErrorCode    ierr;
30925d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
30935d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3094d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3095d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3096d9fead3dSBarry Smith   PetscScalar       *x;
3097d9fead3dSBarry Smith   const PetscScalar *b;
3098f26ec98cSKris Buschelman 
3099f26ec98cSKris Buschelman   PetscFunctionBegin;
3100d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3102f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3103f26ec98cSKris Buschelman 
3104f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3105f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3106f26ec98cSKris Buschelman 
3107f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3108f26ec98cSKris Buschelman   idx    = 4*(*r++);
3109f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3110f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3111f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3112f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3113f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3114f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3115f26ec98cSKris Buschelman     vi    = aj + ai[i];
3116f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3117f26ec98cSKris Buschelman     idx   = 4*(*r++);
3118f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3119f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3120f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3121f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3122f26ec98cSKris Buschelman     while (nz--) {
3123f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3124f26ec98cSKris Buschelman       x1  = t[idx];
3125f26ec98cSKris Buschelman       x2  = t[1+idx];
3126f26ec98cSKris Buschelman       x3  = t[2+idx];
3127f26ec98cSKris Buschelman       x4  = t[3+idx];
3128f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3129f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3130f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3131f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3132f26ec98cSKris Buschelman       v    += 16;
3133f26ec98cSKris Buschelman     }
3134f26ec98cSKris Buschelman     idx        = 4*i;
3135f26ec98cSKris Buschelman     t[idx]   = s1;
3136f26ec98cSKris Buschelman     t[1+idx] = s2;
3137f26ec98cSKris Buschelman     t[2+idx] = s3;
3138f26ec98cSKris Buschelman     t[3+idx] = s4;
3139f26ec98cSKris Buschelman   }
3140f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3141f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3142f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3143f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3144f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3145f26ec98cSKris Buschelman     idt  = 4*i;
3146f26ec98cSKris Buschelman     s1 = t[idt];
3147f26ec98cSKris Buschelman     s2 = t[1+idt];
3148f26ec98cSKris Buschelman     s3 = t[2+idt];
3149f26ec98cSKris Buschelman     s4 = t[3+idt];
3150f26ec98cSKris Buschelman     while (nz--) {
3151f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3152f26ec98cSKris Buschelman       x1  = t[idx];
3153f26ec98cSKris Buschelman       x2  = t[1+idx];
3154f26ec98cSKris Buschelman       x3  = t[2+idx];
3155f26ec98cSKris Buschelman       x4  = t[3+idx];
3156f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3157f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3158f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3159f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3160f26ec98cSKris Buschelman       v += 16;
3161f26ec98cSKris Buschelman     }
3162f26ec98cSKris Buschelman     idc      = 4*(*c--);
3163f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3164f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3165f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3166f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3167f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3168f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3169f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3170f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3171f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3172f26ec98cSKris Buschelman  }
3173f26ec98cSKris Buschelman 
3174f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3175f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3176d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3178dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3179f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3180f26ec98cSKris Buschelman }
3181f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - SSE variant of the block size 4 solve
   with row (a->row) and column (a->col) index sets.

   Right-hand-side and intermediate values are converted to single precision
   in the stack buffers tmps/tmpx, combined with the 4x4 blocks through the
   SSE macros (XMM7 is the running row accumulator), and converted back to
   double when stored into the work vector t = a->solve_work.  The final
   values are copied from t into xx with streaming stores.

   Storage read by this routine:
     - lower factor, block row i : blocks ai[i] .. diag[i]-1
     - block diag[i]             : inverted diagonal block
     - upper factor, block row i : blocks diag[i]+1 .. ai[i+1]-1
*/
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  /* (two 4-float buffers plus up to 3 floats of alignment padding = 11) */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float.               */
        /* The forward results live in the work vector t; x is not      */
        /* written until the backward solve, so it must not be read here. */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /* Step to the previous block row.  Only look up (and prefetch) its  */
      /* diagonal when such a row exists, so diag[-1] is never read on the */
      /* last pass; ai16 then simply keeps the value for row 0.            */
      --i;
      if (i >= 0) {
        ai16 = 16*diag[i];
        PREFETCH_NTA(aa+ai16+16);
      }
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
34040ef38995SBarry Smith 
34050ef38995SBarry Smith 
34064e2b4712SSatish Balay /*
34074e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
34084e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
34094e2b4712SSatish Balay */
34104a2ae208SSatish Balay #undef __FUNCT__
34114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3412dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
34134e2b4712SSatish Balay {
34144e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3415356650c2SBarry Smith   PetscInt          n=a->mbs;
3416356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3417dfbe8321SBarry Smith   PetscErrorCode    ierr;
3418356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3419d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3420d9fead3dSBarry Smith   PetscScalar       *x;
3421d9fead3dSBarry Smith   const PetscScalar *b;
34224e2b4712SSatish Balay 
34234e2b4712SSatish Balay   PetscFunctionBegin;
3424d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
34264e2b4712SSatish Balay 
3427aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
34282853dc0eSBarry Smith   {
342987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
34302853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
34312853dc0eSBarry Smith   }
3432aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
34332853dc0eSBarry Smith   {
343487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
34352853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
34362853dc0eSBarry Smith   }
3437aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
34382853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3439e1293385SBarry Smith #else
344030d4dcafSBarry Smith   {
344187828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3442d9fead3dSBarry Smith     const MatScalar *v;
3443356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3444356650c2SBarry Smith     const PetscInt  *vi;
3445e1293385SBarry Smith 
34464e2b4712SSatish Balay   /* forward solve the lower triangular */
34474e2b4712SSatish Balay   idx    = 0;
3448e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
34494e2b4712SSatish Balay   for (i=1; i<n; i++) {
34504e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
34514e2b4712SSatish Balay     vi    =  aj      + ai[i];
34524e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3453e1293385SBarry Smith     idx   +=  4;
3454f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
34554e2b4712SSatish Balay     while (nz--) {
34564e2b4712SSatish Balay       jdx   = 4*(*vi++);
34574e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3458f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3459f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3460f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3461f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
34624e2b4712SSatish Balay       v    += 16;
34634e2b4712SSatish Balay     }
3464f1af5d2fSBarry Smith     x[idx]   = s1;
3465f1af5d2fSBarry Smith     x[1+idx] = s2;
3466f1af5d2fSBarry Smith     x[2+idx] = s3;
3467f1af5d2fSBarry Smith     x[3+idx] = s4;
34684e2b4712SSatish Balay   }
34694e2b4712SSatish Balay   /* backward solve the upper triangular */
34704e555682SBarry Smith   idt = 4*(n-1);
34714e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
34724e555682SBarry Smith     ai16 = 16*diag[i];
34734e555682SBarry Smith     v    = aa + ai16 + 16;
34744e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
34754e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3476f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3477f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
34784e2b4712SSatish Balay     while (nz--) {
34794e2b4712SSatish Balay       idx   = 4*(*vi++);
34804e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3481f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3482f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3483f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3484f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
34854e2b4712SSatish Balay       v    += 16;
34864e2b4712SSatish Balay     }
34874e555682SBarry Smith     v        = aa + ai16;
3488f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3489f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3490f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3491f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3492329f5518SBarry Smith     idt -= 4;
34934e2b4712SSatish Balay   }
349430d4dcafSBarry Smith   }
3495e1293385SBarry Smith #endif
34964e2b4712SSatish Balay 
3497d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3499dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
35004e2b4712SSatish Balay   PetscFunctionReturn(0);
35014e2b4712SSatish Balay }
35024e2b4712SSatish Balay 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct - Earlier version of the
   natural-ordering solve, compiled only when OLD_ROUTINE_TO_BE_REPLACED is
   defined.  No index sets are applied; xx doubles as the work vector.

   Storage read by this routine:
     - lower factor, block row i : blocks ai[i] .. ai[i+1]-1
     - upper factor, block row i : blocks ai[2*n-i] .. ai[2*n-i+1]-2, followed
       directly by the block that multiplies the row result
     - entry (p,q) of a block lives at offset p+4*q inside the block

   Offsets are scaled by the runtime block size (bs, bs2) although the
   arithmetic is unrolled for four components per block row.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
  PetscErrorCode    ierr;
  PetscInt          n = a->mbs,*ai = a->i,*aj = a->j;
  PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
  PetscInt          brow,k,nblk,urow,*cols;
  const MatScalar   *aa = a->a,*blk;
  PetscScalar       *x,*xrow,*xcol;
  const PetscScalar *b,*rhs;
  PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&x);CHKERRQ(ierr);

  /* forward solve the lower triangular: first block row is a plain copy */
  x[0] = b[0];
  x[1] = b[1];
  x[2] = b[2];
  x[3] = b[3];
  for (brow=1; brow<n; brow++) {
    blk  = aa + bs2*ai[brow];
    cols = aj + ai[brow];
    nblk = ai[brow+1] - ai[brow];
    rhs  = b + bs*brow;
    s1   = rhs[0];
    s2   = rhs[1];
    s3   = rhs[2];
    s4   = rhs[3];
    for (k=0; k<nblk; k++) {
      xcol = x + bs*cols[k];
      x1   = xcol[0];
      x2   = xcol[1];
      x3   = xcol[2];
      x4   = xcol[3];
      s1  -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
      s2  -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
      s3  -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
      s4  -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
      blk += bs2;
    }
    xrow    = x + bs*brow;
    xrow[0] = s1;
    xrow[1] = s2;
    xrow[2] = s3;
    xrow[3] = s4;
  }

  /* backward solve the upper triangular */
  for (brow=n-1; brow>=0; brow--) {
    urow = 2*n - brow;                 /* position of this row's upper part in ai */
    blk  = aa + bs2*ai[urow];
    cols = aj + ai[urow];
    nblk = ai[urow+1] - ai[urow] - 1;
    xrow = x + bs*brow;
    s1   = xrow[0];
    s2   = xrow[1];
    s3   = xrow[2];
    s4   = xrow[3];
    for (k=0; k<nblk; k++) {
      xcol = x + bs*cols[k];
      x1   = xcol[0];
      x2   = xcol[1];
      x3   = xcol[2];
      x4   = xcol[3];
      s1  -= blk[0]*x1 + blk[4]*x2 + blk[8]*x3 + blk[12]*x4;
      s2  -= blk[1]*x1 + blk[5]*x2 + blk[9]*x3 + blk[13]*x4;
      s3  -= blk[2]*x1 + blk[6]*x2 + blk[10]*x3 + blk[14]*x4;
      s4  -= blk[3]*x1 + blk[7]*x2 + blk[11]*x3 + blk[15]*x4;
      blk += bs2;
    }
    /* x = inv_diagonal*x : blk has advanced onto the last block of the row */
    xrow[0] = blk[0]*s1 + blk[4]*s2 + blk[8]*s3 + blk[12]*s4;
    xrow[1] = blk[1]*s1 + blk[5]*s2 + blk[9]*s3 + blk[13]*s4;
    xrow[2] = blk[2]*s1 + blk[6]*s2 + blk[10]*s3 + blk[14]*s4;
    xrow[3] = blk[3]*s1 + blk[7]*s2 + blk[11]*s3 + blk[15]*s4;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
3579cee9d6f2SShri Abhyankar 
3580b2b2dd24SShri Abhyankar #undef __FUNCT__
3581*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3582*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3583b2b2dd24SShri Abhyankar {
3584b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3585b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3586b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3587b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3588b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3589b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3590b2b2dd24SShri Abhyankar     PetscScalar       *x;
3591b2b2dd24SShri Abhyankar     const PetscScalar *b;
3592b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3593cee9d6f2SShri Abhyankar 
3594b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3595b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3596b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3597b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3598b2b2dd24SShri Abhyankar     idx    = 0;
3599b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3600b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3601b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3602b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3603b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3604b2b2dd24SShri Abhyankar       idx   = bs*i;
3605b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3606b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3607b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3608b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3609b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3610b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3611b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3612b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3613b2b2dd24SShri Abhyankar 
3614b2b2dd24SShri Abhyankar           v   +=  bs2;
3615b2b2dd24SShri Abhyankar         }
3616b2b2dd24SShri Abhyankar 
3617b2b2dd24SShri Abhyankar        x[idx]   = s1;
3618b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3619b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3620b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3621b2b2dd24SShri Abhyankar     }
3622b2b2dd24SShri Abhyankar 
3623b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3624b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3625b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3626b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3627b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3628b2b2dd24SShri Abhyankar      idt = bs*i;
3629b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3630b2b2dd24SShri Abhyankar 
3631b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3632b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3633b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3634b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3635b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3636b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3637b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3638b2b2dd24SShri Abhyankar 
3639b2b2dd24SShri Abhyankar         v   +=  bs2;
3640b2b2dd24SShri Abhyankar     }
3641b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3642b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3643b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3644b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3645b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3646b2b2dd24SShri Abhyankar 
3647b2b2dd24SShri Abhyankar   }
3648b2b2dd24SShri Abhyankar 
3649b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3650b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3651b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3652b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3653b2b2dd24SShri Abhyankar }
3654cee9d6f2SShri Abhyankar 
3655cee9d6f2SShri Abhyankar #undef __FUNCT__
3656f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3657dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3658f26ec98cSKris Buschelman {
3659f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3660690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3661dfbe8321SBarry Smith   PetscErrorCode ierr;
3662690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3663f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3664f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3665f26ec98cSKris Buschelman 
3666f26ec98cSKris Buschelman   PetscFunctionBegin;
36671ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
36681ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3669f26ec98cSKris Buschelman 
3670f26ec98cSKris Buschelman   {
3671f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3672f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3673690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3674f26ec98cSKris Buschelman 
3675f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3676f26ec98cSKris Buschelman     idx  = 0;
3677f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3678f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3679f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3680f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3681f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3682f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3683f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3684f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3685f26ec98cSKris Buschelman       idx   +=  4;
3686f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3687f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3688f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3689f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3690f26ec98cSKris Buschelman       while (nz--) {
3691f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3692f26ec98cSKris Buschelman         x1  = t[jdx];
3693f26ec98cSKris Buschelman         x2  = t[1+jdx];
3694f26ec98cSKris Buschelman         x3  = t[2+jdx];
3695f26ec98cSKris Buschelman         x4  = t[3+jdx];
3696f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3697f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3698f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3699f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3700f26ec98cSKris Buschelman         v    += 16;
3701f26ec98cSKris Buschelman       }
3702f26ec98cSKris Buschelman       t[idx]   = s1;
3703f26ec98cSKris Buschelman       t[1+idx] = s2;
3704f26ec98cSKris Buschelman       t[2+idx] = s3;
3705f26ec98cSKris Buschelman       t[3+idx] = s4;
3706f26ec98cSKris Buschelman     }
3707f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3708f26ec98cSKris Buschelman     idt = 4*(n-1);
3709f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3710f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3711f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3712f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3713f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3714f26ec98cSKris Buschelman       s1   = t[idt];
3715f26ec98cSKris Buschelman       s2   = t[1+idt];
3716f26ec98cSKris Buschelman       s3   = t[2+idt];
3717f26ec98cSKris Buschelman       s4   = t[3+idt];
3718f26ec98cSKris Buschelman       while (nz--) {
3719f26ec98cSKris Buschelman         idx = 4*(*vi++);
3720f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3721f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3722f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3723f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3724f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3725f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3726f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3727f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3728f26ec98cSKris Buschelman         v    += 16;
3729f26ec98cSKris Buschelman       }
3730f26ec98cSKris Buschelman       v        = aa + ai16;
3731f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3732f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3733f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3734f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3735f26ec98cSKris Buschelman       idt -= 4;
3736f26ec98cSKris Buschelman     }
3737f26ec98cSKris Buschelman   }
3738f26ec98cSKris Buschelman 
37391ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
37401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3741dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3742f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3743f26ec98cSKris Buschelman }
3744f26ec98cSKris Buschelman 
37453660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
37463660e330SKris Buschelman 
37473660e330SKris Buschelman #include PETSC_HAVE_SSE
37483660e330SKris Buschelman #undef __FUNCT__
37497cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3750dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
37513660e330SKris Buschelman {
37523660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
37532aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3754dfbe8321SBarry Smith   PetscErrorCode ierr;
3755dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
37563660e330SKris Buschelman   MatScalar      *aa=a->a;
375787828ca2SBarry Smith   PetscScalar    *x,*b;
37583660e330SKris Buschelman 
37593660e330SKris Buschelman   PetscFunctionBegin;
37603660e330SKris Buschelman   SSE_SCOPE_BEGIN;
37613660e330SKris Buschelman   /*
37623660e330SKris Buschelman      Note: This code currently uses demotion of double
37633660e330SKris Buschelman      to float when performing the mixed-mode computation.
37643660e330SKris Buschelman      This may not be numerically reasonable for all applications.
37653660e330SKris Buschelman   */
37663660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
37673660e330SKris Buschelman 
37681ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
37691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
37703660e330SKris Buschelman   {
3771eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3772eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
37732aa5897fSKris Buschelman     int            nz,i,idt,ai16;
37742aa5897fSKris Buschelman     unsigned int   jdx,idx;
37752aa5897fSKris Buschelman     unsigned short *vi;
3776eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
37773660e330SKris Buschelman 
3778eb05f457SKris Buschelman     /* First block is the identity. */
37793660e330SKris Buschelman     idx  = 0;
3780eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
37812aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
37823660e330SKris Buschelman 
37833660e330SKris Buschelman     for (i=1; i<n;) {
37843660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
37853660e330SKris Buschelman       vi   =  aj      + ai[i];
37863660e330SKris Buschelman       nz   =  diag[i] - ai[i];
37873660e330SKris Buschelman       idx +=  4;
37883660e330SKris Buschelman 
3789eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3790eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3791eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
37923660e330SKris Buschelman 
37933660e330SKris Buschelman       while (nz--) {
37943660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
37952aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
37963660e330SKris Buschelman 
37973660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3798eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
37993660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
38003660e330SKris Buschelman 
38013660e330SKris Buschelman           /* First Column */
38023660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
38033660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
38043660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
38053660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
38063660e330SKris Buschelman 
38073660e330SKris Buschelman           /* Second Column */
38083660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
38093660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
38103660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
38113660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
38123660e330SKris Buschelman 
38133660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
38143660e330SKris Buschelman 
38153660e330SKris Buschelman           /* Third Column */
38163660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
38173660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
38183660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
38193660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
38203660e330SKris Buschelman 
38213660e330SKris Buschelman           /* Fourth Column */
38223660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
38233660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
38243660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
38253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
38263660e330SKris Buschelman         SSE_INLINE_END_2
38273660e330SKris Buschelman 
38283660e330SKris Buschelman         v  += 16;
38293660e330SKris Buschelman       }
38303660e330SKris Buschelman       v    =  aa + 16*ai[++i];
38313660e330SKris Buschelman       PREFETCH_NTA(v);
3832eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
38333660e330SKris Buschelman     }
3834eb05f457SKris Buschelman 
3835eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3836eb05f457SKris Buschelman 
38373660e330SKris Buschelman     idt  = 4*(n-1);
38383660e330SKris Buschelman     ai16 = 16*diag[n-1];
38393660e330SKris Buschelman     v    = aa + ai16 + 16;
38403660e330SKris Buschelman     for (i=n-1; i>=0;){
38413660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
38423660e330SKris Buschelman       vi = aj + diag[i] + 1;
38433660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
38443660e330SKris Buschelman 
3845eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
38463660e330SKris Buschelman 
38473660e330SKris Buschelman       while (nz--) {
38483660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
38492aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
38503660e330SKris Buschelman 
38513660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3852eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
38533660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
38543660e330SKris Buschelman 
38553660e330SKris Buschelman           /* First Column */
38563660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
38573660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
38583660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
38593660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
38603660e330SKris Buschelman 
38613660e330SKris Buschelman           /* Second Column */
38623660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
38633660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
38643660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
38653660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
38663660e330SKris Buschelman 
38673660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
38683660e330SKris Buschelman 
38693660e330SKris Buschelman           /* Third Column */
38703660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
38713660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
38723660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
38733660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
38743660e330SKris Buschelman 
38753660e330SKris Buschelman           /* Fourth Column */
38763660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
38773660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
38783660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
38793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
38803660e330SKris Buschelman         SSE_INLINE_END_2
38813660e330SKris Buschelman         v  += 16;
38823660e330SKris Buschelman       }
38833660e330SKris Buschelman       v    = aa + ai16;
38843660e330SKris Buschelman       ai16 = 16*diag[--i];
38853660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
38863660e330SKris Buschelman       /*
38873660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
38883660e330SKris Buschelman          which was inverted as part of the factorization
38893660e330SKris Buschelman       */
3890eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
38913660e330SKris Buschelman         /* First Column */
38923660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
38933660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
38943660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
38953660e330SKris Buschelman 
38963660e330SKris Buschelman         /* Second Column */
38973660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
38983660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
38993660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
39003660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
39013660e330SKris Buschelman 
39023660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
39033660e330SKris Buschelman 
39043660e330SKris Buschelman         /* Third Column */
39053660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
39063660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
39073660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
39083660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
39093660e330SKris Buschelman 
39103660e330SKris Buschelman         /* Fourth Column */
39113660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
39123660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
39133660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
39143660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
39153660e330SKris Buschelman 
39163660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
39173660e330SKris Buschelman       SSE_INLINE_END_3
39183660e330SKris Buschelman 
39193660e330SKris Buschelman       v    = aa + ai16 + 16;
39203660e330SKris Buschelman       idt -= 4;
39213660e330SKris Buschelman     }
3922eb05f457SKris Buschelman 
3923eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3924eb05f457SKris Buschelman     idt = 4*(n-1);
3925eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3926eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3927eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3928eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3929eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3930eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3931eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3932eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3933eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
393454693613SKris Buschelman       idt -= 4;
39353660e330SKris Buschelman     }
3936eb05f457SKris Buschelman 
3937eb05f457SKris Buschelman   } /* End of artificial scope. */
39381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
39391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3940dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
39413660e330SKris Buschelman   SSE_SCOPE_END;
39423660e330SKris Buschelman   PetscFunctionReturn(0);
39433660e330SKris Buschelman }
39443660e330SKris Buschelman 
39457cf1b8d3SKris Buschelman #undef __FUNCT__
39467cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3947dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
39487cf1b8d3SKris Buschelman {
39497cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
39507cf1b8d3SKris Buschelman   int            *aj=a->j;
3951dfbe8321SBarry Smith   PetscErrorCode ierr;
3952dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
39537cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
39547cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
39557cf1b8d3SKris Buschelman 
39567cf1b8d3SKris Buschelman   PetscFunctionBegin;
39577cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
39587cf1b8d3SKris Buschelman   /*
39597cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
39607cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
39617cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
39627cf1b8d3SKris Buschelman   */
39637cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
39647cf1b8d3SKris Buschelman 
39651ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39661ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
39677cf1b8d3SKris Buschelman   {
39687cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
39697cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
39707cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
39717cf1b8d3SKris Buschelman     int       jdx,idx;
39727cf1b8d3SKris Buschelman     int       *vi;
39737cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
39747cf1b8d3SKris Buschelman 
39757cf1b8d3SKris Buschelman     /* First block is the identity. */
39767cf1b8d3SKris Buschelman     idx  = 0;
39777cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
39787cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
39797cf1b8d3SKris Buschelman 
39807cf1b8d3SKris Buschelman     for (i=1; i<n;) {
39817cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
39827cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
39837cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
39847cf1b8d3SKris Buschelman       idx +=  4;
39857cf1b8d3SKris Buschelman 
39867cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
39877cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
39887cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
39897cf1b8d3SKris Buschelman 
39907cf1b8d3SKris Buschelman       while (nz--) {
39917cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
39927cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
39937cf1b8d3SKris Buschelman /*          jdx = *vi++; */
39947cf1b8d3SKris Buschelman 
39957cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
39967cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
39977cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
39987cf1b8d3SKris Buschelman 
39997cf1b8d3SKris Buschelman           /* First Column */
40007cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
40017cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
40027cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
40037cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
40047cf1b8d3SKris Buschelman 
40057cf1b8d3SKris Buschelman           /* Second Column */
40067cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
40077cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
40087cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
40097cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
40107cf1b8d3SKris Buschelman 
40117cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
40127cf1b8d3SKris Buschelman 
40137cf1b8d3SKris Buschelman           /* Third Column */
40147cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
40157cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
40167cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
40177cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
40187cf1b8d3SKris Buschelman 
40197cf1b8d3SKris Buschelman           /* Fourth Column */
40207cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
40217cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
40227cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
40237cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
40247cf1b8d3SKris Buschelman         SSE_INLINE_END_2
40257cf1b8d3SKris Buschelman 
40267cf1b8d3SKris Buschelman         v  += 16;
40277cf1b8d3SKris Buschelman       }
40287cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
40297cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
40307cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
40317cf1b8d3SKris Buschelman     }
40327cf1b8d3SKris Buschelman 
40337cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
40347cf1b8d3SKris Buschelman 
40357cf1b8d3SKris Buschelman     idt  = 4*(n-1);
40367cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
40377cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
40387cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
40397cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
40407cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
40417cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
40427cf1b8d3SKris Buschelman 
40437cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
40447cf1b8d3SKris Buschelman 
40457cf1b8d3SKris Buschelman       while (nz--) {
40467cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
40477cf1b8d3SKris Buschelman         idx = 4*(*vi++);
40487cf1b8d3SKris Buschelman /*          idx = *vi++; */
40497cf1b8d3SKris Buschelman 
40507cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
40517cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
40527cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
40537cf1b8d3SKris Buschelman 
40547cf1b8d3SKris Buschelman           /* First Column */
40557cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
40567cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
40577cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
40587cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
40597cf1b8d3SKris Buschelman 
40607cf1b8d3SKris Buschelman           /* Second Column */
40617cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
40627cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
40637cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
40647cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
40657cf1b8d3SKris Buschelman 
40667cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
40677cf1b8d3SKris Buschelman 
40687cf1b8d3SKris Buschelman           /* Third Column */
40697cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
40707cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
40717cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
40727cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
40737cf1b8d3SKris Buschelman 
40747cf1b8d3SKris Buschelman           /* Fourth Column */
40757cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
40767cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
40777cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
40787cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
40797cf1b8d3SKris Buschelman         SSE_INLINE_END_2
40807cf1b8d3SKris Buschelman         v  += 16;
40817cf1b8d3SKris Buschelman       }
40827cf1b8d3SKris Buschelman       v    = aa + ai16;
40837cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
40847cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
40857cf1b8d3SKris Buschelman       /*
40867cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
40877cf1b8d3SKris Buschelman          which was inverted as part of the factorization
40887cf1b8d3SKris Buschelman       */
40897cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
40907cf1b8d3SKris Buschelman         /* First Column */
40917cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
40927cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
40937cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
40947cf1b8d3SKris Buschelman 
40957cf1b8d3SKris Buschelman         /* Second Column */
40967cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
40977cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
40987cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
40997cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
41007cf1b8d3SKris Buschelman 
41017cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
41027cf1b8d3SKris Buschelman 
41037cf1b8d3SKris Buschelman         /* Third Column */
41047cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
41057cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
41067cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
41077cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
41087cf1b8d3SKris Buschelman 
41097cf1b8d3SKris Buschelman         /* Fourth Column */
41107cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
41117cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
41127cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
41137cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
41147cf1b8d3SKris Buschelman 
41157cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
41167cf1b8d3SKris Buschelman       SSE_INLINE_END_3
41177cf1b8d3SKris Buschelman 
41187cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
41197cf1b8d3SKris Buschelman       idt -= 4;
41207cf1b8d3SKris Buschelman     }
41217cf1b8d3SKris Buschelman 
41227cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
41237cf1b8d3SKris Buschelman     idt = 4*(n-1);
41247cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
41257cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
41267cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
41277cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
41287cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
41297cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
41307cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
41317cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
41327cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
41337cf1b8d3SKris Buschelman       idt -= 4;
41347cf1b8d3SKris Buschelman     }
41357cf1b8d3SKris Buschelman 
41367cf1b8d3SKris Buschelman   } /* End of artificial scope. */
41371ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
41381ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4139dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
41407cf1b8d3SKris Buschelman   SSE_SCOPE_END;
41417cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
41427cf1b8d3SKris Buschelman }
41437cf1b8d3SKris Buschelman 
41443660e330SKris Buschelman #endif
41458f690400SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_3 - Forward and backward block sweeps for a SeqBAIJ
   matrix with 3x3 blocks, using the index sets stored in a->row and a->col.

   Input:
     A  - matrix; a->i, a->j, a->a and a->diag are read
     bb - right-hand side vector
   Output:
     xx - result vector

   The right-hand side is gathered through the row index set into the work
   array a->solve_work; both sweeps run on that work array, and every
   finished block is also written into xx through the column index set.
   A block is 9 consecutive MatScalars read column by column, i.e. entry
   (row,col) of a block is v[row+3*col].

   NOTE(review): the block at diag[i] is multiplied into the result rather
   than solved with, so it is presumably stored already inverted by the
   factorization - confirm against the factorization routine.
   NOTE(review): the first row index and the first block of b and t are
   accessed before any loop bound is tested, so n >= 1 is assumed - confirm.
*/
41464a2ae208SSatish Balay #undef __FUNCT__
41474a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4148dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
41494e2b4712SSatish Balay {
41504e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
41514e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
41526849ba73SBarry Smith   PetscErrorCode    ierr;
41535d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
41545d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4155d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4156d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4157d9fead3dSBarry Smith   const PetscScalar *b;
41584e2b4712SSatish Balay 
41594e2b4712SSatish Balay   PetscFunctionBegin;
  /* b is only read below; the cast is needed because VecGetArray is handed a non-const pointer */
4160d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41611ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4162f1af5d2fSBarry Smith   t  = a->solve_work;
41634e2b4712SSatish Balay 
  /* r walks the row indices forward; c starts at the last column index and walks backward */
41644e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
41654e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
41664e2b4712SSatish Balay 
41674e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0 is only gathered from b; no update is applied to it */
41684e2b4712SSatish Balay   idx    = 3*(*r++);
4169f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
41704e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* entries ai[i] .. diag[i]-1 of block row i are used in this sweep */
41714e2b4712SSatish Balay     v     = aa + 9*ai[i];
41724e2b4712SSatish Balay     vi    = aj + ai[i];
41734e2b4712SSatish Balay     nz    = diag[i] - ai[i];
41744e2b4712SSatish Balay     idx   = 3*(*r++);
4175f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
41764e2b4712SSatish Balay     while (nz--) {
      /* subtract (3x3 block) * (already computed block of t) */
41774e2b4712SSatish Balay       idx   = 3*(*vi++);
4178f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4179f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4180f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4181f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41824e2b4712SSatish Balay       v += 9;
41834e2b4712SSatish Balay     }
41844e2b4712SSatish Balay     idx = 3*i;
4185f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
41864e2b4712SSatish Balay   }
41874e2b4712SSatish Balay   /* backward solve the upper triangular */
41884e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* entries diag[i]+1 .. ai[i+1]-1 of block row i are used in this sweep */
41894e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
41904e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41914e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
41924e2b4712SSatish Balay     idt  = 3*i;
4193f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
41944e2b4712SSatish Balay     while (nz--) {
41954e2b4712SSatish Balay       idx   = 3*(*vi++);
4196f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4197f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4198f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4199f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
42004e2b4712SSatish Balay       v += 9;
42014e2b4712SSatish Balay     }
    /* multiply by the block stored at diag[i]; the result goes to the work array and to xx */
42024e2b4712SSatish Balay     idc = 3*(*c--);
42034e2b4712SSatish Balay     v   = aa + 9*diag[i];
4204f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4205f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4206f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
42074e2b4712SSatish Balay   }
42084e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
42094e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4210d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to PetscLogFlops */
4212dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
42134e2b4712SSatish Balay   PetscFunctionReturn(0);
42144e2b4712SSatish Balay }
42154e2b4712SSatish Balay 
/*
   Older variant of MatSolve_SeqBAIJ_3_newdatastruct. It is compiled only
   when OLD_ROUTINE_TO_BE_REPLACED is defined; a definition with the same
   name follows the #endif.

   Same interface as the other 3x3 solves in this file (A, bb in, xx out),
   with index sets a->row and a->col and the work array a->solve_work.
   The difference is where the entries of a block row are looked up:
     - forward sweep, row i: entries ai[i] .. ai[i+1]-1
     - backward sweep, row i: with k = 2*n-i, entries ai[k] .. ai[k+1]-2,
       and the block at ai[k+1]-1 is the one multiplied in at the end
   a->diag is not used here.

   NOTE(review): the multiplied-in block is presumably the inverted
   diagonal block produced by the factorization - confirm.
   NOTE(review): r[0], b and t are accessed before any loop bound is
   tested, so n >= 1 is assumed - confirm.
*/
4216*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED)
42178f690400SShri Abhyankar #undef __FUNCT__
42188f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
42198f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
42208f690400SShri Abhyankar {
42218f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42228f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
42238f690400SShri Abhyankar   PetscErrorCode    ierr;
422429b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
42258f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
42268f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
42278f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
42288f690400SShri Abhyankar   const PetscScalar *b;
42298f690400SShri Abhyankar 
42308f690400SShri Abhyankar   PetscFunctionBegin;
42318f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42328f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
42338f690400SShri Abhyankar   t  = a->solve_work;
42348f690400SShri Abhyankar 
  /* both index arrays are addressed as r[i] / c[i]; neither pointer is advanced */
42358f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
423629b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
42378f690400SShri Abhyankar 
42388f690400SShri Abhyankar   /* forward solve the lower triangular */
423929b92fc1SShri Abhyankar   idx    = 3*r[0];
42408f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
42418f690400SShri Abhyankar   for (i=1; i<n; i++) {
    /* every entry between ai[i] and ai[i+1] takes part in the forward sweep */
42428f690400SShri Abhyankar     v     = aa + 9*ai[i];
42438f690400SShri Abhyankar     vi    = aj + ai[i];
42448f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
424529b92fc1SShri Abhyankar     idx   = 3*r[i];
42468f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
424729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
424829b92fc1SShri Abhyankar       idx   = 3*vi[m];
42498f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
42508f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
42518f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
42528f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
42538f690400SShri Abhyankar       v += 9;
42548f690400SShri Abhyankar     }
42558f690400SShri Abhyankar     idx = 3*i;
42568f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
42578f690400SShri Abhyankar   }
42588f690400SShri Abhyankar   /* backward solve the upper triangular */
42598f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the entries for row i of this sweep are found through ai[2*n-i] */
42608f690400SShri Abhyankar     k    = 2*n-i;
42618f690400SShri Abhyankar     v    = aa + 9*ai[k];
42628f690400SShri Abhyankar     vi   = aj + ai[k];
42638f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
42648f690400SShri Abhyankar     idt  = 3*i;
42658f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
426629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
426729b92fc1SShri Abhyankar       idx   = 3*vi[m];
42688f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
42698f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
42708f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
42718f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
42728f690400SShri Abhyankar       v += 9;
42738f690400SShri Abhyankar     }
    /* v was advanced past the nz blocks above and now addresses the last block of the row */
427429b92fc1SShri Abhyankar     idc = 3*c[i];
42758f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
42768f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
42778f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
42788f690400SShri Abhyankar   }
42798f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
42808f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
42818f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42828f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
42838f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
42848f690400SShri Abhyankar   PetscFunctionReturn(0);
42858f690400SShri Abhyankar }
4286*a2d6a19aSShri Abhyankar #endif
42878f690400SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_3_newdatastruct - Forward and backward block sweeps for
   a SeqBAIJ matrix with 3x3 blocks, using the index sets a->row and a->col
   and the work array a->solve_work.

   Input:
     A  - matrix; a->i, a->j, a->a and a->diag are read
     bb - right-hand side vector
   Output:
     xx - result vector

   Entry lookup per block row i:
     - forward sweep: entries ai[i] .. ai[i+1]-1
     - backward sweep: entries adiag[i+1]+1 .. adiag[i]-1, followed by the
       block at adiag[i], which is multiplied into the result
   For i = n-1 this reads adiag[n], so a->diag must hold n+1 entries.
   A block is 9 consecutive MatScalars; entry (row,col) is v[row+3*col].

   NOTE(review): the block at adiag[i] is presumably the inverted diagonal
   block produced by the factorization - confirm.
   NOTE(review): r[0], b and t are accessed before any loop bound is
   tested, so n >= 1 is assumed - confirm.
*/
42880c4413a7SShri Abhyankar #undef __FUNCT__
4289*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4290*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
42910c4413a7SShri Abhyankar {
42920c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42930c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
42940c4413a7SShri Abhyankar   PetscErrorCode    ierr;
42950c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
42960c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
42970c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
42980c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
42990c4413a7SShri Abhyankar   const PetscScalar *b;
43000c4413a7SShri Abhyankar 
43010c4413a7SShri Abhyankar   PetscFunctionBegin;
43020c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43030c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43040c4413a7SShri Abhyankar   t  = a->solve_work;
43050c4413a7SShri Abhyankar 
  /* both index arrays are addressed as r[i] / c[i]; neither pointer is advanced */
43060c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
43070c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
43080c4413a7SShri Abhyankar 
43090c4413a7SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 is only gathered from b through r[0] */
43100c4413a7SShri Abhyankar   idx    = 3*r[0];
43110c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
43120c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
43130c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
43140c4413a7SShri Abhyankar     vi    = aj + ai[i];
43150c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
43160c4413a7SShri Abhyankar     idx   = 3*r[i];
43170c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
43180c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
      /* subtract (3x3 block) * (already computed block of t) */
43190c4413a7SShri Abhyankar       idx   = 3*vi[m];
43200c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
43210c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
43220c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
43230c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
43240c4413a7SShri Abhyankar       v += 9;
43250c4413a7SShri Abhyankar     }
43260c4413a7SShri Abhyankar     idx = 3*i;
43270c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
43280c4413a7SShri Abhyankar   }
43290c4413a7SShri Abhyankar   /* backward solve the upper triangular */
43300c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the entries of row i for this sweep start right after adiag[i+1] */
43310c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
43320c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
43330c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
43340c4413a7SShri Abhyankar     idt  = 3*i;
43350c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
43360c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
43370c4413a7SShri Abhyankar       idx   = 3*vi[m];
43380c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
43390c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
43400c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
43410c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
43420c4413a7SShri Abhyankar       v += 9;
43430c4413a7SShri Abhyankar     }
    /* after the loop v addresses the block at adiag[i]; it is multiplied in and the result stored in t and xx */
43440c4413a7SShri Abhyankar     idc = 3*c[i];
43450c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
43460c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
43470c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
43480c4413a7SShri Abhyankar   }
43490c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
43500c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
43510c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43520c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
43530c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
43540c4413a7SShri Abhyankar   PetscFunctionReturn(0);
43550c4413a7SShri Abhyankar }
43560c4413a7SShri Abhyankar 
435715091d37SBarry Smith /*
435815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
435915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
436015091d37SBarry Smith */
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering - 3x3-block forward and backward
   sweeps without any index sets.

   Input:
     A  - matrix; a->i, a->j, a->a and a->diag are read
     bb - right-hand side vector
   Output:
     xx - result vector

   Block row i of bb corresponds directly to block row i of xx, and both
   sweeps are carried out in place in the xx array (no work array is used).
   Entry lookup per block row i: ai[i] .. diag[i]-1 in the forward sweep,
   diag[i]+1 .. ai[i+1]-1 in the backward sweep, and the block at diag[i]
   is multiplied into the result. Entry (row,col) of a block is v[row+3*col].

   NOTE(review): the block at diag[i] is presumably stored already inverted
   by the factorization - confirm.
   NOTE(review): x[0..2] and b[0..2] are accessed before any loop bound is
   tested, so n >= 1 is assumed - confirm.
*/
43614a2ae208SSatish Balay #undef __FUNCT__
43624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4363dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
436415091d37SBarry Smith {
436515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4366690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4367dfbe8321SBarry Smith   PetscErrorCode    ierr;
4368690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4369d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4370d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4371d9fead3dSBarry Smith   const PetscScalar *b;
4372690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
437315091d37SBarry Smith 
437415091d37SBarry Smith   PetscFunctionBegin;
4375d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
437715091d37SBarry Smith 
437815091d37SBarry Smith   /* forward solve the lower triangular */
  /* block row 0 is copied from b unchanged; idx is then stepped by 3 per block row */
437915091d37SBarry Smith   idx    = 0;
438015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
438115091d37SBarry Smith   for (i=1; i<n; i++) {
438215091d37SBarry Smith     v     =  aa      + 9*ai[i];
438315091d37SBarry Smith     vi    =  aj      + ai[i];
438415091d37SBarry Smith     nz    =  diag[i] - ai[i];
438515091d37SBarry Smith     idx   +=  3;
4386f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
438715091d37SBarry Smith     while (nz--) {
      /* subtract (3x3 block) * (block of x at the entry's column index) */
438815091d37SBarry Smith       jdx   = 3*(*vi++);
438915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4390f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4391f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4392f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
439315091d37SBarry Smith       v    += 9;
439415091d37SBarry Smith     }
4395f1af5d2fSBarry Smith     x[idx]   = s1;
4396f1af5d2fSBarry Smith     x[1+idx] = s2;
4397f1af5d2fSBarry Smith     x[2+idx] = s3;
439815091d37SBarry Smith   }
439915091d37SBarry Smith   /* backward solve the upper triangular */
440015091d37SBarry Smith   for (i=n-1; i>=0; i--){
440115091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
440215091d37SBarry Smith     vi   = aj + diag[i] + 1;
440315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
440415091d37SBarry Smith     idt  = 3*i;
4405f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4406f1af5d2fSBarry Smith     s3 = x[2+idt];
440715091d37SBarry Smith     while (nz--) {
440815091d37SBarry Smith       idx   = 3*(*vi++);
440915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4410f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4411f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4412f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
441315091d37SBarry Smith       v    += 9;
441415091d37SBarry Smith     }
    /* multiply by the block stored at diag[i] and overwrite block row i of x */
441515091d37SBarry Smith     v        = aa +  9*diag[i];
4416f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4417f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4418f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
441915091d37SBarry Smith   }
442015091d37SBarry Smith 
4421d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4423dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
442415091d37SBarry Smith   PetscFunctionReturn(0);
442515091d37SBarry Smith }
442615091d37SBarry Smith 
/*
   Older variant of MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct. It is
   compiled only when OLD_ROUTINE_TO_BE_REPLACED is defined; a definition
   with the same name follows the #endif.

   Same interface as the other solves in this file (A, bb in, xx out); no
   index sets are used and the sweeps run in place in the xx array.
   Entry lookup per block row i:
     - forward sweep: entries ai[i] .. ai[i+1]-1
     - backward sweep: entries ai[2*n-i] .. ai[2*n-i+1]-2, and the block at
       ai[2*n-i+1]-1 is the one multiplied in at the end
   a->diag is not used here.

   NOTE(review): bs and bs2 are read from the matrix and used as strides,
   but the arithmetic is written out for three components and v[0..8], so
   bs == 3 and bs2 == 9 are presumably required - confirm.
   NOTE(review): x[0..2] and b[0..2] are accessed before any loop bound is
   tested, so n >= 1 is assumed - confirm.
*/
4427*a2d6a19aSShri Abhyankar #if defined(OLD_ROUTINE_TO_BE_REPLACED)
44284a2ae208SSatish Balay #undef __FUNCT__
4429cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4430cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4431cee9d6f2SShri Abhyankar {
4432cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4433ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4434cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4435cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
4436cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4437cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4438cee9d6f2SShri Abhyankar     PetscScalar       *x;
4439cee9d6f2SShri Abhyankar     const PetscScalar *b;
4440cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4441cee9d6f2SShri Abhyankar 
4442cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4443cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4444cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4445cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4446cee9d6f2SShri Abhyankar     idx    = 0;
4447cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4448cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4449cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
4450cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4451cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4452cee9d6f2SShri Abhyankar       idx   = bs*i;
4453cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4454ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4455ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
4456cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4457cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4458cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4459cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4460cee9d6f2SShri Abhyankar 
4461cee9d6f2SShri Abhyankar           v   +=  bs2;
4462cee9d6f2SShri Abhyankar         }
4463cee9d6f2SShri Abhyankar 
4464cee9d6f2SShri Abhyankar        x[idx]   = s1;
4465cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4466cee9d6f2SShri Abhyankar        x[2+idx] = s3;
4467cee9d6f2SShri Abhyankar     }
4468cee9d6f2SShri Abhyankar 
4469cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4470cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
     /* the entries for row i of this sweep are found through ai[2*n-i] */
4471cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
4472cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4473cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4474cee9d6f2SShri Abhyankar      idt = bs*i;
4475cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4476cee9d6f2SShri Abhyankar 
4477ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4478ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
4479cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4480cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4481cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4482cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4483cee9d6f2SShri Abhyankar 
4484cee9d6f2SShri Abhyankar         v   +=  bs2;
4485cee9d6f2SShri Abhyankar     }
4486cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
    /* v was advanced past the nz blocks above and now addresses the last block of the row */
4487cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4488cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4489cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4490cee9d6f2SShri Abhyankar 
4491cee9d6f2SShri Abhyankar   }
4492cee9d6f2SShri Abhyankar 
4493cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4494cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4495cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4496cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4497cee9d6f2SShri Abhyankar }
4498*a2d6a19aSShri Abhyankar #endif
4499cee9d6f2SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct - 3x3-block forward and
   backward sweeps without index sets, carried out in place in the xx array.

   Input:
     A  - matrix; a->i, a->j, a->a, a->diag, a->bs2 and A->rmap->bs are read
     bb - right-hand side vector
   Output:
     xx - result vector

   Entry lookup per block row i:
     - forward sweep: entries ai[i] .. ai[i+1]-1
     - backward sweep: entries adiag[i+1]+1 .. adiag[i]-1, followed by the
       block at adiag[i], which is multiplied into the result
   For i = n-1 this reads adiag[n], so a->diag must hold n+1 entries.

   NOTE(review): bs and bs2 are used as strides, but the arithmetic is
   written out for three components and v[0..8], so bs == 3 and bs2 == 9
   are presumably required - confirm.
   NOTE(review): x[0..2] and b[0..2] are accessed before any loop bound is
   tested, so n >= 1 is assumed - confirm.
*/
4500cee9d6f2SShri Abhyankar #undef __FUNCT__
4501*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4502*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4503b2b2dd24SShri Abhyankar {
4504b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4505b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4506b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4507b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4508b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4509b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4510b2b2dd24SShri Abhyankar     PetscScalar       *x;
4511b2b2dd24SShri Abhyankar     const PetscScalar *b;
4512b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4513b2b2dd24SShri Abhyankar 
4514b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4515b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4516b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4517b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 is copied from b unchanged */
4518b2b2dd24SShri Abhyankar     idx    = 0;
4519b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4520b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4521b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4522b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4523b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4524b2b2dd24SShri Abhyankar       idx   = bs*i;
4525b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4526b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
          /* subtract (3x3 block) * (block of x at the entry's column index) */
4527b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4528b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4529b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4530b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4531b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4532b2b2dd24SShri Abhyankar 
4533b2b2dd24SShri Abhyankar           v   +=  bs2;
4534b2b2dd24SShri Abhyankar         }
4535b2b2dd24SShri Abhyankar 
4536b2b2dd24SShri Abhyankar        x[idx]   = s1;
4537b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4538b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4539b2b2dd24SShri Abhyankar     }
4540b2b2dd24SShri Abhyankar 
4541b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4542b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the entries of row i for this sweep start right after adiag[i+1] */
4543b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4544b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4545b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4546b2b2dd24SShri Abhyankar      idt = bs*i;
4547b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4548b2b2dd24SShri Abhyankar 
4549b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4550b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4551b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4552b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4553b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4554b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4555b2b2dd24SShri Abhyankar 
4556b2b2dd24SShri Abhyankar         v   +=  bs2;
4557b2b2dd24SShri Abhyankar     }
4558b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
    /* after the loop v addresses the block at adiag[i] */
4559b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4560b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4561b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4562b2b2dd24SShri Abhyankar 
4563b2b2dd24SShri Abhyankar   }
4564b2b2dd24SShri Abhyankar 
4565b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4566b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4567b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4568b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4569b2b2dd24SShri Abhyankar }
4570b2b2dd24SShri Abhyankar 
/*
   MatSolve_SeqBAIJ_2 - Forward and backward block sweeps for a SeqBAIJ
   matrix with 2x2 blocks, using the index sets stored in a->row and a->col.

   Input:
     A  - matrix; a->i, a->j, a->a and a->diag are read
     bb - right-hand side vector
   Output:
     xx - result vector

   The right-hand side is gathered through the row index set into the work
   array a->solve_work; both sweeps run on that work array, and every
   finished block is also written into xx through the column index set.
   A block is 4 consecutive MatScalars read column by column, i.e. entry
   (row,col) of a block is v[row+2*col].

   NOTE(review): the block at diag[i] is multiplied into the result rather
   than solved with, so it is presumably stored already inverted by the
   factorization - confirm against the factorization routine.
   NOTE(review): the first row index and the first block of b and t are
   accessed before any loop bound is tested, so n >= 1 is assumed - confirm.
*/
4571b2b2dd24SShri Abhyankar #undef __FUNCT__
45724a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4573dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
45744e2b4712SSatish Balay {
45754e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
45764e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
45776849ba73SBarry Smith   PetscErrorCode    ierr;
45785d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
45795d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4580d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4581d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4582d9fead3dSBarry Smith   const PetscScalar *b;
45834e2b4712SSatish Balay 
45844e2b4712SSatish Balay   PetscFunctionBegin;
4585d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4587f1af5d2fSBarry Smith   t  = a->solve_work;
45884e2b4712SSatish Balay 
  /* r walks the row indices forward; c starts at the last column index and walks backward */
45894e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
45904e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
45914e2b4712SSatish Balay 
45924e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0 is only gathered from b; no update is applied to it */
45934e2b4712SSatish Balay   idx    = 2*(*r++);
4594f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
45954e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* entries ai[i] .. diag[i]-1 of block row i are used in this sweep */
45964e2b4712SSatish Balay     v     = aa + 4*ai[i];
45974e2b4712SSatish Balay     vi    = aj + ai[i];
45984e2b4712SSatish Balay     nz    = diag[i] - ai[i];
45994e2b4712SSatish Balay     idx   = 2*(*r++);
4600f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
46014e2b4712SSatish Balay     while (nz--) {
      /* subtract (2x2 block) * (already computed block of t) */
46024e2b4712SSatish Balay       idx   = 2*(*vi++);
4603f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4604f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4605f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
46064e2b4712SSatish Balay       v += 4;
46074e2b4712SSatish Balay     }
46084e2b4712SSatish Balay     idx = 2*i;
4609f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
46104e2b4712SSatish Balay   }
46114e2b4712SSatish Balay   /* backward solve the upper triangular */
46124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* entries diag[i]+1 .. ai[i+1]-1 of block row i are used in this sweep */
46134e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
46144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
46154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
46164e2b4712SSatish Balay     idt  = 2*i;
4617f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
46184e2b4712SSatish Balay     while (nz--) {
46194e2b4712SSatish Balay       idx   = 2*(*vi++);
4620f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4621f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4622f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
46234e2b4712SSatish Balay       v += 4;
46244e2b4712SSatish Balay     }
    /* multiply by the block stored at diag[i]; the result goes to the work array and to xx */
46254e2b4712SSatish Balay     idc = 2*(*c--);
46264e2b4712SSatish Balay     v   = aa + 4*diag[i];
4627f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4628f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
46294e2b4712SSatish Balay   }
46304e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
46314e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4632d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* flop count handed to PetscLogFlops */
4634dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
46354e2b4712SSatish Balay   PetscFunctionReturn(0);
46364e2b4712SSatish Balay }
46374e2b4712SSatish Balay 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
/*
   MatSolve_SeqBAIJ_2_newdatastruct - Superseded triangular solve for block
   size 2 with row and column permutations.  It is compiled only when
   OLD_ROUTINE_TO_BE_REPLACED is defined.

   This variant finds the stored row of U that belongs to block row i through
   the row pointer entry a->i[2*n-i].  The block stored right after the
   off-diagonal blocks of that row is multiplied into the partial sums at the
   end of the back substitution.  Each 2x2 block is used column by column
   (entries 0,1 form the first column).

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  IS                colperm = fact->col,rowperm = fact->row;
  PetscErrorCode    ierr;
  PetscInt          mbs = fact->mbs,*rowptr = fact->i,*colind = fact->j;
  PetscInt          row,urow,cnt,j,*cols;
  const PetscInt    *rperm,*cperm;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,*work,sum1,sum2,in1,in2;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);
  work = fact->solve_work;

  ierr = ISGetIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISGetIndices(colperm,&cperm);CHKERRQ(ierr);

  /* forward sweep: gather the right-hand side through the row permutation and
     eliminate with the blocks of L; no diagonal block is applied here */
  work[0] = rhs[2*rperm[0]];
  work[1] = rhs[2*rperm[0]+1];
  for (row=1; row<mbs; row++) {
    blk  = vals   + 4*rowptr[row];
    cols = colind + rowptr[row];
    cnt  = rowptr[row+1] - rowptr[row];
    sum1 = rhs[2*rperm[row]];
    sum2 = rhs[2*rperm[row]+1];
    for (j=0; j<cnt; j++,blk+=4) {
      in1   = work[2*cols[j]];
      in2   = work[2*cols[j]+1];
      sum1 -= blk[0]*in1 + blk[2]*in2;
      sum2 -= blk[1]*in1 + blk[3]*in2;
    }
    work[2*row]   = sum1;
    work[2*row+1] = sum2;
  }

  /* backward sweep over the rows of U, last block row first */
  for (row=mbs-1; row>=0; row--) {
    urow = 2*mbs - row;                       /* row pointer slot holding U(row,:) */
    blk  = vals   + 4*rowptr[urow];
    cols = colind + rowptr[urow];
    cnt  = rowptr[urow+1] - rowptr[urow] - 1; /* the last stored block is handled after the loop */
    sum1 = work[2*row];
    sum2 = work[2*row+1];
    for (j=0; j<cnt; j++,blk+=4) {
      in1   = work[2*cols[j]];
      in2   = work[2*cols[j]+1];
      sum1 -= blk[0]*in1 + blk[2]*in2;
      sum2 -= blk[1]*in1 + blk[3]*in2;
    }
    /* blk now addresses the block that follows the off-diagonal ones */
    in1 = blk[0]*sum1 + blk[2]*sum2;
    in2 = blk[1]*sum1 + blk[3]*sum2;
    work[2*row]          = in1;
    work[2*row+1]        = in2;
    sol[2*cperm[row]]    = in1;               /* scatter through the column permutation */
    sol[2*cperm[row]+1]  = in2;
  }

  ierr = ISRestoreIndices(rowperm,&rperm);CHKERRQ(ierr);
  ierr = ISRestoreIndices(colperm,&cperm);CHKERRQ(ierr);
  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(fact->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
47068f690400SShri Abhyankar 
47070c4413a7SShri Abhyankar #undef __FUNCT__
4708*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4709*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
47100c4413a7SShri Abhyankar {
47110c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47120c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
47130c4413a7SShri Abhyankar   PetscErrorCode    ierr;
47140c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
47150c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
47160c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
47170c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
47180c4413a7SShri Abhyankar   const PetscScalar *b;
47190c4413a7SShri Abhyankar 
47200c4413a7SShri Abhyankar   PetscFunctionBegin;
47210c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47220c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
47230c4413a7SShri Abhyankar   t  = a->solve_work;
47240c4413a7SShri Abhyankar 
47250c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47260c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
47270c4413a7SShri Abhyankar 
47280c4413a7SShri Abhyankar   /* forward solve the lower triangular */
47290c4413a7SShri Abhyankar   idx    = 2*r[0];
47300c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
47310c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
47320c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
47330c4413a7SShri Abhyankar     vi    = aj + ai[i];
47340c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
47350c4413a7SShri Abhyankar     idx   = 2*r[i];
47360c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
47370c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
47380c4413a7SShri Abhyankar       jdx   = 2*vi[m];
47390c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
47400c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
47410c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
47420c4413a7SShri Abhyankar       v += 4;
47430c4413a7SShri Abhyankar     }
47440c4413a7SShri Abhyankar     idx = 2*i;
47450c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
47460c4413a7SShri Abhyankar   }
47470c4413a7SShri Abhyankar   /* backward solve the upper triangular */
47480c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
47490c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
47500c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
47510c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
47520c4413a7SShri Abhyankar     idt  = 2*i;
47530c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
47540c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
47550c4413a7SShri Abhyankar       idx   = 2*vi[m];
47560c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
47570c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
47580c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
47590c4413a7SShri Abhyankar       v += 4;
47600c4413a7SShri Abhyankar     }
47610c4413a7SShri Abhyankar     idc = 2*c[i];
47620c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
47630c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
47640c4413a7SShri Abhyankar   }
47650c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
47660c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
47670c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47680c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
47690c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
47700c4413a7SShri Abhyankar   PetscFunctionReturn(0);
47710c4413a7SShri Abhyankar }
47728f690400SShri Abhyankar 
477315091d37SBarry Smith /*
477415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
477515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
477615091d37SBarry Smith */
47774a2ae208SSatish Balay #undef __FUNCT__
47784a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4779dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
478015091d37SBarry Smith {
478115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4782690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4783dfbe8321SBarry Smith   PetscErrorCode    ierr;
4784690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4785d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4786d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4787d9fead3dSBarry Smith   const PetscScalar *b;
4788690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
478915091d37SBarry Smith 
479015091d37SBarry Smith   PetscFunctionBegin;
4791d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
479315091d37SBarry Smith 
479415091d37SBarry Smith   /* forward solve the lower triangular */
479515091d37SBarry Smith   idx    = 0;
479615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
479715091d37SBarry Smith   for (i=1; i<n; i++) {
479815091d37SBarry Smith     v     =  aa      + 4*ai[i];
479915091d37SBarry Smith     vi    =  aj      + ai[i];
480015091d37SBarry Smith     nz    =  diag[i] - ai[i];
480115091d37SBarry Smith     idx   +=  2;
4802f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
480315091d37SBarry Smith     while (nz--) {
480415091d37SBarry Smith       jdx   = 2*(*vi++);
480515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4806f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4807f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
480815091d37SBarry Smith       v    += 4;
480915091d37SBarry Smith     }
4810f1af5d2fSBarry Smith     x[idx]   = s1;
4811f1af5d2fSBarry Smith     x[1+idx] = s2;
481215091d37SBarry Smith   }
481315091d37SBarry Smith   /* backward solve the upper triangular */
481415091d37SBarry Smith   for (i=n-1; i>=0; i--){
481515091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
481615091d37SBarry Smith     vi   = aj + diag[i] + 1;
481715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
481815091d37SBarry Smith     idt  = 2*i;
4819f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
482015091d37SBarry Smith     while (nz--) {
482115091d37SBarry Smith       idx   = 2*(*vi++);
482215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4823f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4824f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
482515091d37SBarry Smith       v    += 4;
482615091d37SBarry Smith     }
482715091d37SBarry Smith     v        = aa +  4*diag[i];
4828f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4829f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
483015091d37SBarry Smith   }
483115091d37SBarry Smith 
4832d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4834dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
483515091d37SBarry Smith   PetscFunctionReturn(0);
483615091d37SBarry Smith }
483715091d37SBarry Smith 
#if defined(OLD_ROUTINE_TO_BE_REPLACED)
/*
   MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct - Superseded triangular
   solve for block size 2 without permutations, performed in place in the
   array of xx.  It is compiled only when OLD_ROUTINE_TO_BE_REPLACED is defined.

   This variant finds the stored row of U that belongs to block row i through
   the row pointer entry a->i[2*n-i].  The block stored right after the
   off-diagonal blocks of that row is multiplied into the partial sums.
   Each 2x2 block is used column by column (entries 0,1 form the first column).

   Input Parameters:
+  A  - the factored matrix
-  bb - the right-hand side

   Output Parameter:
.  xx - the solution
*/
#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
{
  Mat_SeqBAIJ       *fact = (Mat_SeqBAIJ*)A->data;
  PetscErrorCode    ierr;
  PetscInt          mbs = fact->mbs,*rowptr = fact->i,*colind = fact->j;
  PetscInt          row,urow,cnt,j,*cols;
  const MatScalar   *vals = fact->a,*blk;
  PetscScalar       *sol,sum1,sum2,in1,in2;
  const PetscScalar *rhs;

  PetscFunctionBegin;
  ierr = VecGetArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecGetArray(xx,&sol);CHKERRQ(ierr);

  /* forward sweep with the blocks of L; no diagonal block is applied here */
  sol[0] = rhs[0];
  sol[1] = rhs[1];
  for (row=1; row<mbs; row++) {
    blk  = vals   + 4*rowptr[row];
    cols = colind + rowptr[row];
    cnt  = rowptr[row+1] - rowptr[row];
    sum1 = rhs[2*row];
    sum2 = rhs[2*row+1];
    for (j=0; j<cnt; j++,blk+=4) {
      in1   = sol[2*cols[j]];
      in2   = sol[2*cols[j]+1];
      sum1 -= blk[0]*in1 + blk[2]*in2;
      sum2 -= blk[1]*in1 + blk[3]*in2;
    }
    sol[2*row]   = sum1;
    sol[2*row+1] = sum2;
  }

  /* backward sweep over the rows of U, last block row first */
  for (row=mbs-1; row>=0; row--) {
    urow = 2*mbs - row;                       /* row pointer slot holding U(row,:) */
    blk  = vals   + 4*rowptr[urow];
    cols = colind + rowptr[urow];
    cnt  = rowptr[urow+1] - rowptr[urow] - 1; /* the last stored block is handled after the loop */
    sum1 = sol[2*row];
    sum2 = sol[2*row+1];
    for (j=0; j<cnt; j++,blk+=4) {
      in1   = sol[2*cols[j]];
      in2   = sol[2*cols[j]+1];
      sum1 -= blk[0]*in1 + blk[2]*in2;
      sum2 -= blk[1]*in1 + blk[3]*in2;
    }
    /* x = inv_diagonal*x */
    sol[2*row]   = blk[0]*sum1 + blk[2]*sum2;
    sol[2*row+1] = blk[1]*sum1 + blk[3]*sum2;
  }

  ierr = VecRestoreArray(bb,(PetscScalar**)&rhs);CHKERRQ(ierr);
  ierr = VecRestoreArray(xx,&sol);CHKERRQ(ierr);
  ierr = PetscLogFlops(2.0*4*(fact->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}
#endif
4899cee9d6f2SShri Abhyankar 
4900cee9d6f2SShri Abhyankar #undef __FUNCT__
4901*a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4902*a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4903b2b2dd24SShri Abhyankar {
4904b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4905b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4906b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4907b2b2dd24SShri Abhyankar     PetscInt          jdx;
4908b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4909b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4910b2b2dd24SShri Abhyankar     const PetscScalar *b;
4911b2b2dd24SShri Abhyankar 
4912b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4913b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4914b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4915b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4916b2b2dd24SShri Abhyankar     idx    = 0;
4917b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4918b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4919b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4920b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4921b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4922b2b2dd24SShri Abhyankar        idx  = 2*i;
4923b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4924b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4925b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4926b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4927b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4928b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4929b2b2dd24SShri Abhyankar            v   +=  4;
4930b2b2dd24SShri Abhyankar         }
4931b2b2dd24SShri Abhyankar        x[idx]   = s1;
4932b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4933b2b2dd24SShri Abhyankar     }
4934b2b2dd24SShri Abhyankar 
4935b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4936b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4937b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4938b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4939b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4940b2b2dd24SShri Abhyankar      idt = 2*i;
4941b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4942b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4943b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4944b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4945b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4946b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4947b2b2dd24SShri Abhyankar          v    += 4;
4948b2b2dd24SShri Abhyankar     }
4949b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4950b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4951b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4952b2b2dd24SShri Abhyankar   }
4953b2b2dd24SShri Abhyankar 
4954b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4955b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4956b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4957b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4958b2b2dd24SShri Abhyankar }
4959b2b2dd24SShri Abhyankar 
4960b2b2dd24SShri Abhyankar #undef __FUNCT__
49614a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4962dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
49634e2b4712SSatish Balay {
49644e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
49654e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
49666849ba73SBarry Smith   PetscErrorCode ierr;
49675d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
49685d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
49693f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
497087828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
49714e2b4712SSatish Balay 
49724e2b4712SSatish Balay   PetscFunctionBegin;
49734e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
49744e2b4712SSatish Balay 
49751ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
49761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4977f1af5d2fSBarry Smith   t  = a->solve_work;
49784e2b4712SSatish Balay 
49794e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
49804e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
49814e2b4712SSatish Balay 
49824e2b4712SSatish Balay   /* forward solve the lower triangular */
4983f1af5d2fSBarry Smith   t[0] = b[*r++];
49844e2b4712SSatish Balay   for (i=1; i<n; i++) {
49854e2b4712SSatish Balay     v     = aa + ai[i];
49864e2b4712SSatish Balay     vi    = aj + ai[i];
49874e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4988f1af5d2fSBarry Smith     s1  = b[*r++];
49894e2b4712SSatish Balay     while (nz--) {
4990f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
49914e2b4712SSatish Balay     }
4992f1af5d2fSBarry Smith     t[i] = s1;
49934e2b4712SSatish Balay   }
49944e2b4712SSatish Balay   /* backward solve the upper triangular */
49954e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
49964e2b4712SSatish Balay     v    = aa + diag[i] + 1;
49974e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
49984e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4999f1af5d2fSBarry Smith     s1 = t[i];
50004e2b4712SSatish Balay     while (nz--) {
5001f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
50024e2b4712SSatish Balay     }
5003f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
50044e2b4712SSatish Balay   }
50054e2b4712SSatish Balay 
50064e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
50074e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
50081ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50091ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5010dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
50114e2b4712SSatish Balay   PetscFunctionReturn(0);
50124e2b4712SSatish Balay }
501315091d37SBarry Smith /*
501415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
501515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
501615091d37SBarry Smith */
50174a2ae208SSatish Balay #undef __FUNCT__
50184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5019dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
502015091d37SBarry Smith {
502115091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5022690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5023dfbe8321SBarry Smith   PetscErrorCode ierr;
5024690b6cddSBarry Smith   PetscInt       *diag = a->diag;
502515091d37SBarry Smith   MatScalar      *aa=a->a;
502687828ca2SBarry Smith   PetscScalar    *x,*b;
502787828ca2SBarry Smith   PetscScalar    s1,x1;
502815091d37SBarry Smith   MatScalar      *v;
5029690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
503015091d37SBarry Smith 
503115091d37SBarry Smith   PetscFunctionBegin;
50321ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
50331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
503415091d37SBarry Smith 
503515091d37SBarry Smith   /* forward solve the lower triangular */
503615091d37SBarry Smith   idx    = 0;
503715091d37SBarry Smith   x[0]   = b[0];
503815091d37SBarry Smith   for (i=1; i<n; i++) {
503915091d37SBarry Smith     v     =  aa      + ai[i];
504015091d37SBarry Smith     vi    =  aj      + ai[i];
504115091d37SBarry Smith     nz    =  diag[i] - ai[i];
504215091d37SBarry Smith     idx   +=  1;
5043f1af5d2fSBarry Smith     s1  =  b[idx];
504415091d37SBarry Smith     while (nz--) {
504515091d37SBarry Smith       jdx   = *vi++;
504615091d37SBarry Smith       x1    = x[jdx];
5047f1af5d2fSBarry Smith       s1 -= v[0]*x1;
504815091d37SBarry Smith       v    += 1;
504915091d37SBarry Smith     }
5050f1af5d2fSBarry Smith     x[idx]   = s1;
505115091d37SBarry Smith   }
505215091d37SBarry Smith   /* backward solve the upper triangular */
505315091d37SBarry Smith   for (i=n-1; i>=0; i--){
505415091d37SBarry Smith     v    = aa + diag[i] + 1;
505515091d37SBarry Smith     vi   = aj + diag[i] + 1;
505615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
505715091d37SBarry Smith     idt  = i;
5058f1af5d2fSBarry Smith     s1 = x[idt];
505915091d37SBarry Smith     while (nz--) {
506015091d37SBarry Smith       idx   = *vi++;
506115091d37SBarry Smith       x1    = x[idx];
5062f1af5d2fSBarry Smith       s1 -= v[0]*x1;
506315091d37SBarry Smith       v    += 1;
506415091d37SBarry Smith     }
506515091d37SBarry Smith     v        = aa +  diag[i];
5066f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
506715091d37SBarry Smith   }
50681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5070dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
507115091d37SBarry Smith   PetscFunctionReturn(0);
507215091d37SBarry Smith }
50734e2b4712SSatish Balay 
50744e2b4712SSatish Balay /* ----------------------------------------------------------------*/
507516a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
50766bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5077ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
50786bce7ff8SHong Zhang 
50796bce7ff8SHong Zhang #undef __FUNCT__
50806bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
50816bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
50826bce7ff8SHong Zhang {
50836bce7ff8SHong Zhang   Mat            C=B;
50846bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
50856bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
50866bce7ff8SHong Zhang   PetscErrorCode ierr;
50876bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
50886bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
50896bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5090b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5091914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5092914a18a2SHong Zhang   MatScalar      *v_work;
5093ae3d28f0SHong Zhang   PetscTruth     col_identity,row_identity,both_identity;
50946bce7ff8SHong Zhang 
50956bce7ff8SHong Zhang   PetscFunctionBegin;
50966bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
50976bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5098ae3d28f0SHong Zhang 
5099fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5100fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
51016bce7ff8SHong Zhang   ics  = ic;
51026bce7ff8SHong Zhang 
5103914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5104fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5105914a18a2SHong Zhang 
51066bce7ff8SHong Zhang   for (i=0; i<n; i++){
51076bce7ff8SHong Zhang     /* zero rtmp */
51086bce7ff8SHong Zhang     /* L part */
51096bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
51106bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5111914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5112914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5113914a18a2SHong Zhang     }
51146bce7ff8SHong Zhang 
51156bce7ff8SHong Zhang     /* U part */
51161a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
51171a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
51181a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
51191a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51201a83e813SShri Abhyankar     }
51211a83e813SShri Abhyankar 
51221a83e813SShri Abhyankar     /* load in initial (unfactored row) */
51231a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
51241a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
51251a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
51261a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51271a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
51281a83e813SShri Abhyankar     }
51291a83e813SShri Abhyankar 
51301a83e813SShri Abhyankar     /* elimination */
51311a83e813SShri Abhyankar     bjtmp = bj + bi[i];
51321a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
51331a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
51341a83e813SShri Abhyankar       row = bjtmp[k];
51351a83e813SShri Abhyankar       pc = rtmp + bs2*row;
51361a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
51371a83e813SShri Abhyankar       if (flg) {
51381a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
51391a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
51401a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
51411a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
51421a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
51431a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
51441a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
51451a83e813SShri Abhyankar         }
51461a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
51471a83e813SShri Abhyankar       }
51481a83e813SShri Abhyankar     }
51491a83e813SShri Abhyankar 
51501a83e813SShri Abhyankar     /* finished row so stick it into b->a */
51511a83e813SShri Abhyankar     /* L part */
51521a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
51531a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
51541a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
51551a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51561a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51571a83e813SShri Abhyankar     }
51581a83e813SShri Abhyankar 
51591a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
51601a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
51611a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
51621a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
51631a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51641a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
51651a83e813SShri Abhyankar 
51661a83e813SShri Abhyankar     /* U part */
51671a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
51681a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
51691a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
51701a83e813SShri Abhyankar     for (j=0; j<nz; j++){
51711a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51721a83e813SShri Abhyankar     }
51731a83e813SShri Abhyankar   }
51741a83e813SShri Abhyankar 
51751a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5176fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
51771a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
51781a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
51791a83e813SShri Abhyankar 
5180ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5181ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5182ae3d28f0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
5183ae3d28f0SHong Zhang   if (both_identity){
5184*a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5185ae3d28f0SHong Zhang   } else {
5186*a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5187ae3d28f0SHong Zhang   }
5188ae3d28f0SHong Zhang 
51891a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
51901a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
51911a83e813SShri Abhyankar   PetscFunctionReturn(0);
51921a83e813SShri Abhyankar }
51931a83e813SShri Abhyankar 
51946bce7ff8SHong Zhang /*
51956bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
519616a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
519716a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
51986bce7ff8SHong Zhang */
5199c0c7eb62SShri Abhyankar 
52006bce7ff8SHong Zhang #undef __FUNCT__
52016bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
52026bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
52036bce7ff8SHong Zhang {
52046bce7ff8SHong Zhang 
52056bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
52066bce7ff8SHong Zhang   PetscErrorCode     ierr;
520716a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
520835aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
520935aa4fcfSShri Abhyankar 
521035aa4fcfSShri Abhyankar   PetscFunctionBegin;
521135aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
521235aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
521335aa4fcfSShri Abhyankar 
521435aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
521535aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
521635aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
521735aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
521835aa4fcfSShri Abhyankar   if (!b->diag){
521935aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
522035aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
522135aa4fcfSShri Abhyankar   }
522235aa4fcfSShri Abhyankar   bdiag = b->diag;
522335aa4fcfSShri Abhyankar 
522435aa4fcfSShri Abhyankar   if (n > 0) {
522535aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
522635aa4fcfSShri Abhyankar   }
522735aa4fcfSShri Abhyankar 
522835aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
522935aa4fcfSShri Abhyankar   bi = b->i;
523035aa4fcfSShri Abhyankar   bj = b->j;
523135aa4fcfSShri Abhyankar 
523235aa4fcfSShri Abhyankar   /* L part */
523335aa4fcfSShri Abhyankar   bi[0] = 0;
523435aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
523535aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
523635aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
523735aa4fcfSShri Abhyankar     aj = a->j + ai[i];
523835aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
523935aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
524035aa4fcfSShri Abhyankar     }
524135aa4fcfSShri Abhyankar   }
524235aa4fcfSShri Abhyankar 
524335aa4fcfSShri Abhyankar   /* U part */
524435aa4fcfSShri Abhyankar   bi_temp = bi[n];
524535aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
524635aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
524735aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
524835aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
524935aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
525035aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
525135aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
525235aa4fcfSShri Abhyankar     }
525335aa4fcfSShri Abhyankar     /* diag[i] */
525435aa4fcfSShri Abhyankar     *bj = i; bj++;
525535aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
525635aa4fcfSShri Abhyankar   }
525735aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
525835aa4fcfSShri Abhyankar }
525935aa4fcfSShri Abhyankar 
526035aa4fcfSShri Abhyankar #undef __FUNCT__
526116a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
526216a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
526316a2bf60SHong Zhang {
526416a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
526516a2bf60SHong Zhang   IS                 isicol;
526616a2bf60SHong Zhang   PetscErrorCode     ierr;
526716a2bf60SHong Zhang   const PetscInt     *r,*ic;
52687fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
526916a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
527016a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
527116a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
52727fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
527316a2bf60SHong Zhang   PetscReal          f;
527416a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
527516a2bf60SHong Zhang   PetscBT            lnkbt;
527616a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
527716a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
527816a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
527916a2bf60SHong Zhang   PetscTruth         missing;
52807fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
528116a2bf60SHong Zhang 
528216a2bf60SHong Zhang   PetscFunctionBegin;
528316a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
528416a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
528516a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
528616a2bf60SHong Zhang 
528716a2bf60SHong Zhang   f             = info->fill;
528816a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
528916a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
529016a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
529116a2bf60SHong Zhang 
529216a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
529316a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
52947fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
529516a2bf60SHong Zhang 
52967fa3a6a0SHong Zhang   if (!levels && both_identity) {
529716a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
529816a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5299ae3d28f0SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
530035aa4fcfSShri Abhyankar 
530135aa4fcfSShri Abhyankar     fact->factor = MAT_FACTOR_ILU;
530235aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
530335aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
530435aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
530535aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
530635aa4fcfSShri Abhyankar     b->row           = isrow;
530735aa4fcfSShri Abhyankar     b->col           = iscol;
530835aa4fcfSShri Abhyankar     b->icol          = isicol;
530935aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
531035aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
531135aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
531235aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
531335aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
531435aa4fcfSShri Abhyankar   }
531535aa4fcfSShri Abhyankar 
531635aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
531735aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
531835aa4fcfSShri Abhyankar 
531935aa4fcfSShri Abhyankar   /* get new row pointers */
532035aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
532135aa4fcfSShri Abhyankar   bi[0] = 0;
532235aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
532335aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
532435aa4fcfSShri Abhyankar   bdiag[0]  = 0;
532535aa4fcfSShri Abhyankar 
5326fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
532735aa4fcfSShri Abhyankar 
532835aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
532935aa4fcfSShri Abhyankar   nlnk = n + 1;
533035aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
533135aa4fcfSShri Abhyankar 
533235aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
533335aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
533435aa4fcfSShri Abhyankar   current_space = free_space;
533535aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
533635aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
533735aa4fcfSShri Abhyankar 
533835aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
533935aa4fcfSShri Abhyankar     nzi = 0;
534035aa4fcfSShri Abhyankar     /* copy current row into linked list */
534135aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
534235aa4fcfSShri Abhyankar     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
534335aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
534435aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
534535aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
534635aa4fcfSShri Abhyankar     nzi += nlnk;
534735aa4fcfSShri Abhyankar 
534835aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
534935aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
535035aa4fcfSShri Abhyankar       fm = n;
535135aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
535235aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
535335aa4fcfSShri Abhyankar       lnk[fm]    = i;
535435aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
535535aa4fcfSShri Abhyankar       nzi++; dcount++;
535635aa4fcfSShri Abhyankar     }
535735aa4fcfSShri Abhyankar 
535835aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
535935aa4fcfSShri Abhyankar     nzbd = 0;
536035aa4fcfSShri Abhyankar     prow = lnk[n];
536135aa4fcfSShri Abhyankar     while (prow < i) {
536235aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
536335aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
536435aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
536535aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
536635aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
536735aa4fcfSShri Abhyankar       nzi += nlnk;
536835aa4fcfSShri Abhyankar       prow = lnk[prow];
536935aa4fcfSShri Abhyankar       nzbd++;
537035aa4fcfSShri Abhyankar     }
537135aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
537235aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
537335aa4fcfSShri Abhyankar 
537435aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
537535aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
537635aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
537735aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
537835aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
537935aa4fcfSShri Abhyankar       reallocs++;
538035aa4fcfSShri Abhyankar     }
538135aa4fcfSShri Abhyankar 
538235aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
538335aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
538435aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
538535aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
538635aa4fcfSShri Abhyankar 
538735aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
538835aa4fcfSShri Abhyankar     if (*(bj_ptr[i]+bdiag[i]) != i) {
538935aa4fcfSShri Abhyankar       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
539035aa4fcfSShri Abhyankar     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
539135aa4fcfSShri Abhyankar     }
539235aa4fcfSShri Abhyankar 
539335aa4fcfSShri Abhyankar     current_space->array           += nzi;
539435aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
539535aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
539635aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
539735aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
539835aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
539935aa4fcfSShri Abhyankar   }
540035aa4fcfSShri Abhyankar 
540135aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
540235aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
540335aa4fcfSShri Abhyankar 
540435aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
540535aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
540635aa4fcfSShri Abhyankar 
540735aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
540835aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
540935aa4fcfSShri Abhyankar 
541035aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
541135aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5412fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
541335aa4fcfSShri Abhyankar 
541435aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
541535aa4fcfSShri Abhyankar   {
541635aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
541735aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
541835aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
541935aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
542035aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
542135aa4fcfSShri Abhyankar     if (diagonal_fill) {
542235aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
542335aa4fcfSShri Abhyankar     }
542435aa4fcfSShri Abhyankar   }
542535aa4fcfSShri Abhyankar #endif
542635aa4fcfSShri Abhyankar 
542735aa4fcfSShri Abhyankar   /* put together the new matrix */
542835aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
542935aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
543035aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
543135aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
543235aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
543335aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
543435aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
543535aa4fcfSShri Abhyankar   b->j          = bj;
543635aa4fcfSShri Abhyankar   b->i          = bi;
543735aa4fcfSShri Abhyankar   b->diag       = bdiag;
543835aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
543935aa4fcfSShri Abhyankar   b->ilen       = 0;
544035aa4fcfSShri Abhyankar   b->imax       = 0;
544135aa4fcfSShri Abhyankar   b->row        = isrow;
544235aa4fcfSShri Abhyankar   b->col        = iscol;
544335aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
544435aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
544535aa4fcfSShri Abhyankar   b->icol       = isicol;
544635aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
544735aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
544835aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
544935aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
545035aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
5451ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
5452ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
5453ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5454ae3d28f0SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
545535aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
545635aa4fcfSShri Abhyankar }
545735aa4fcfSShri Abhyankar 
545835aa4fcfSShri Abhyankar 
54594e2b4712SSatish Balay /*
54604e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
54614e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
54624e2b4712SSatish Balay    Not a good example of code reuse.
54634e2b4712SSatish Balay */
54644a2ae208SSatish Balay #undef __FUNCT__
54654a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
54660481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
54674e2b4712SSatish Balay {
54684e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
54694e2b4712SSatish Balay   IS             isicol;
54706849ba73SBarry Smith   PetscErrorCode ierr;
54715d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
54725d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5473a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5474d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
547541df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5476329f5518SBarry Smith   PetscReal      f;
5477c0c7eb62SShri Abhyankar   PetscTruth     newdatastruct = PETSC_FALSE;
54784e2b4712SSatish Balay 
54794e2b4712SSatish Balay   PetscFunctionBegin;
548016a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
548116a2bf60SHong Zhang   if (newdatastruct){
548216a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
548316a2bf60SHong Zhang     PetscFunctionReturn(0);
548416a2bf60SHong Zhang   }
548516a2bf60SHong Zhang 
54866bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
54876bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
54886bce7ff8SHong Zhang 
5489435faa5fSBarry Smith   f             = info->fill;
5490690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5491690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
54924c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
549316a2bf60SHong Zhang 
5494667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5495667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
54967d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5497309c388cSBarry Smith 
549841df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
549916a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
55006bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
55016bce7ff8SHong Zhang 
5502719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5503ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
5504bb3d539aSBarry Smith     b->row       = isrow;
5505bb3d539aSBarry Smith     b->col       = iscol;
5506bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5507bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5508bb3d539aSBarry Smith     b->icol      = isicol;
5509bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5510b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
55116bce7ff8SHong Zhang     PetscFunctionReturn(0);
55126bce7ff8SHong Zhang   }
55136bce7ff8SHong Zhang 
55146bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
55154e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
55164e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
55174e2b4712SSatish Balay 
55184e2b4712SSatish Balay     /* get new row pointers */
5519690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
55204e2b4712SSatish Balay     ainew[0] = 0;
55214e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5522690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5523690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
55244e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5525690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
55264e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5527690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
55284e2b4712SSatish Balay     /* im is level for each filled value */
5529690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
55304e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5531690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
55324e2b4712SSatish Balay     dloc[0]  = 0;
55334e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5534435faa5fSBarry Smith 
5535435faa5fSBarry Smith       /* copy prow into linked list */
55364e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
55373b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
55384e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
55394e2b4712SSatish Balay       fill[n]    = n;
5540435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
55414e2b4712SSatish Balay       while (nz--) {
55424e2b4712SSatish Balay 	fm  = n;
55434e2b4712SSatish Balay 	idx = ic[*xi++];
55444e2b4712SSatish Balay 	do {
55454e2b4712SSatish Balay 	  m  = fm;
55464e2b4712SSatish Balay 	  fm = fill[m];
55474e2b4712SSatish Balay 	} while (fm < idx);
55484e2b4712SSatish Balay 	fill[m]   = idx;
55494e2b4712SSatish Balay 	fill[idx] = fm;
55504e2b4712SSatish Balay 	im[idx]   = 0;
55514e2b4712SSatish Balay       }
5552435faa5fSBarry Smith 
5553435faa5fSBarry Smith       /* make sure diagonal entry is included */
5554435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5555435faa5fSBarry Smith 	fm = n;
5556435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5557435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5558435faa5fSBarry Smith 	fill[fm]   = prow;
5559435faa5fSBarry Smith 	im[prow]   = 0;
5560435faa5fSBarry Smith 	nzf++;
5561335d9088SBarry Smith 	dcount++;
5562435faa5fSBarry Smith       }
5563435faa5fSBarry Smith 
55644e2b4712SSatish Balay       nzi = 0;
55654e2b4712SSatish Balay       row = fill[n];
55664e2b4712SSatish Balay       while (row < prow) {
55674e2b4712SSatish Balay 	incrlev = im[row] + 1;
55684e2b4712SSatish Balay 	nz      = dloc[row];
5569435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
55704e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
55714e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
55724e2b4712SSatish Balay 	fm      = row;
55734e2b4712SSatish Balay 	while (nnz-- > 0) {
55744e2b4712SSatish Balay 	  idx = *xi++;
55754e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
55764e2b4712SSatish Balay 	    flev++;
55774e2b4712SSatish Balay 	    continue;
55784e2b4712SSatish Balay 	  }
55794e2b4712SSatish Balay 	  do {
55804e2b4712SSatish Balay 	    m  = fm;
55814e2b4712SSatish Balay 	    fm = fill[m];
55824e2b4712SSatish Balay 	  } while (fm < idx);
55834e2b4712SSatish Balay 	  if (fm != idx) {
55844e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
55854e2b4712SSatish Balay 	    fill[m]   = idx;
55864e2b4712SSatish Balay 	    fill[idx] = fm;
55874e2b4712SSatish Balay 	    fm        = idx;
55884e2b4712SSatish Balay 	    nzf++;
5589ecf371e4SBarry Smith 	  } else {
55904e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
55914e2b4712SSatish Balay 	  }
55924e2b4712SSatish Balay 	  flev++;
55934e2b4712SSatish Balay 	}
55944e2b4712SSatish Balay 	row = fill[row];
55954e2b4712SSatish Balay 	nzi++;
55964e2b4712SSatish Balay       }
55974e2b4712SSatish Balay       /* copy new filled row into permanent storage */
55984e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
55994e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5600ecf371e4SBarry Smith 
5601ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5602ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5603ecf371e4SBarry Smith 	/* just double the memory each time */
5604690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5605ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
56064e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
56074e2b4712SSatish Balay 	jmax += maxadd;
5608ecf371e4SBarry Smith 
5609ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
56105d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
56115d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5612606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
56135d0c19d7SBarry Smith 	ajnew = xitmp;
56145d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
56155d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5616606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
56175d0c19d7SBarry Smith 	ajfill = xitmp;
5618eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
56194e2b4712SSatish Balay       }
56205d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
56214e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
56224e2b4712SSatish Balay       dloc[prow]  = nzi;
56234e2b4712SSatish Balay       fm          = fill[n];
56244e2b4712SSatish Balay       while (nzf--) {
56255d0c19d7SBarry Smith 	*xitmp++ = fm;
56264e2b4712SSatish Balay 	*flev++ = im[fm];
56274e2b4712SSatish Balay 	fm      = fill[fm];
56284e2b4712SSatish Balay       }
5629435faa5fSBarry Smith       /* make sure row has diagonal entry */
5630435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
563177431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
56322401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5633435faa5fSBarry Smith       }
56344e2b4712SSatish Balay     }
5635606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
56364e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
56374e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5638606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5639606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
56404e2b4712SSatish Balay 
56416cf91177SBarry Smith #if defined(PETSC_USE_INFO)
56424e2b4712SSatish Balay     {
5643329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5644ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5645ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5646ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5647ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5648335d9088SBarry Smith       if (diagonal_fill) {
5649ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5650335d9088SBarry Smith       }
56514e2b4712SSatish Balay     }
565263ba0a88SBarry Smith #endif
56534e2b4712SSatish Balay 
56544e2b4712SSatish Balay     /* put together the new matrix */
5655719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5656719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5657ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
5658e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5659e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
56607c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5661a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
56624e2b4712SSatish Balay     b->j          = ajnew;
56634e2b4712SSatish Balay     b->i          = ainew;
56644e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
56654e2b4712SSatish Balay     b->diag       = dloc;
56667f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
56674e2b4712SSatish Balay     b->ilen       = 0;
56684e2b4712SSatish Balay     b->imax       = 0;
56694e2b4712SSatish Balay     b->row        = isrow;
56704e2b4712SSatish Balay     b->col        = iscol;
5671bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5672c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5673c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5674e51c0b9cSSatish Balay     b->icol       = isicol;
567587828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
56764e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
56774e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5678719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
56794e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
56804e2b4712SSatish Balay 
5681ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
5682ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
5683ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
56846bce7ff8SHong Zhang 
568541df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
56868661488fSKris Buschelman   PetscFunctionReturn(0);
56878661488fSKris Buschelman }
56888661488fSKris Buschelman 
5689732ee342SKris Buschelman #undef __FUNCT__
56907e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5691dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
56927e7071cdSKris Buschelman {
569312272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
569412272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
56955a9542e3SKris Buschelman   PetscFunctionBegin;
56967cf1b8d3SKris Buschelman   /* Undo Column scaling */
56977cf1b8d3SKris Buschelman /*    while (nz--) { */
56987cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
56997cf1b8d3SKris Buschelman /*    } */
5700c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5701c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
57027cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
57037cf1b8d3SKris Buschelman }
57047cf1b8d3SKris Buschelman 
57057cf1b8d3SKris Buschelman #undef __FUNCT__
57067cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5707dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
57087cf1b8d3SKris Buschelman {
57097cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5710b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
57112aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
57125a9542e3SKris Buschelman   PetscFunctionBegin;
57130b9da03eSKris Buschelman   /* Is this really necessary? */
571420235379SKris Buschelman   while (nz--) {
57150b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
57167e7071cdSKris Buschelman   }
5717c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
57187e7071cdSKris Buschelman   PetscFunctionReturn(0);
57197e7071cdSKris Buschelman }
57207e7071cdSKris Buschelman 
5721732ee342SKris Buschelman 
5722