xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 7fa3a6a09039016628d7d5d5fddb58a44fcdf355)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1280dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
128115091d37SBarry Smith {
128215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1283690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1284dfbe8321SBarry Smith   PetscErrorCode    ierr;
1285690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1286d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1287d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1288d9fead3dSBarry Smith   const PetscScalar *b;
128915091d37SBarry Smith 
129015091d37SBarry Smith   PetscFunctionBegin;
1291d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129315091d37SBarry Smith   /* forward solve the lower triangular */
129415091d37SBarry Smith   idx    = 0;
129515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
129615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
129715091d37SBarry Smith   x[6] = b[6+idx];
129815091d37SBarry Smith   for (i=1; i<n; i++) {
129915091d37SBarry Smith     v     =  aa + 49*ai[i];
130015091d37SBarry Smith     vi    =  aj + ai[i];
130115091d37SBarry Smith     nz    =  diag[i] - ai[i];
130215091d37SBarry Smith     idx   =  7*i;
1303f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1304f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1305f1af5d2fSBarry Smith     s7  =  b[6+idx];
130615091d37SBarry Smith     while (nz--) {
130715091d37SBarry Smith       jdx   = 7*(*vi++);
130815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
130915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
131015091d37SBarry Smith       x7    = x[6+jdx];
1311f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1312f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1313f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1314f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1315f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1316f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1317f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131815091d37SBarry Smith       v += 49;
131915091d37SBarry Smith      }
1320f1af5d2fSBarry Smith     x[idx]   = s1;
1321f1af5d2fSBarry Smith     x[1+idx] = s2;
1322f1af5d2fSBarry Smith     x[2+idx] = s3;
1323f1af5d2fSBarry Smith     x[3+idx] = s4;
1324f1af5d2fSBarry Smith     x[4+idx] = s5;
1325f1af5d2fSBarry Smith     x[5+idx] = s6;
1326f1af5d2fSBarry Smith     x[6+idx] = s7;
132715091d37SBarry Smith   }
132815091d37SBarry Smith   /* backward solve the upper triangular */
132915091d37SBarry Smith   for (i=n-1; i>=0; i--){
133015091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
133115091d37SBarry Smith     vi   = aj + diag[i] + 1;
133215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
133315091d37SBarry Smith     idt  = 7*i;
1334f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1335f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1336f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1337f1af5d2fSBarry Smith     s7 = x[6+idt];
133815091d37SBarry Smith     while (nz--) {
133915091d37SBarry Smith       idx   = 7*(*vi++);
134015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
134115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
134215091d37SBarry Smith       x7    = x[6+idx];
1343f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1344f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1345f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1346f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1347f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1348f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1349f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
135015091d37SBarry Smith       v += 49;
135115091d37SBarry Smith     }
135215091d37SBarry Smith     v        = aa + 49*diag[i];
1353f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1354f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1355f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1356f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1357f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1358f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1359f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1360f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1361f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1362f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1363f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1364f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1365f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1366f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
136715091d37SBarry Smith   }
136815091d37SBarry Smith 
1369d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1371dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
137215091d37SBarry Smith   PetscFunctionReturn(0);
137315091d37SBarry Smith }
137415091d37SBarry Smith 
13754a2ae208SSatish Balay #undef __FUNCT__
1376cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1377cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1378cee9d6f2SShri Abhyankar {
1379cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1380cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1381cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1382cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1383cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1384cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1385cee9d6f2SShri Abhyankar     PetscScalar       *x;
1386cee9d6f2SShri Abhyankar     const PetscScalar *b;
1387cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1388cee9d6f2SShri Abhyankar 
1389cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1390cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1391cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1392cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1393cee9d6f2SShri Abhyankar     idx    = 0;
1394cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1395cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1396cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1397cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1398cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1399cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1400cee9d6f2SShri Abhyankar       idx   = bs*i;
1401cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1402cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1403cee9d6f2SShri Abhyankar        while (nz--) {
1404cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
1405cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1406cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1407cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1408cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1409cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1410cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1411cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1412cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1413cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1414cee9d6f2SShri Abhyankar           v   +=  bs2;
1415cee9d6f2SShri Abhyankar         }
1416cee9d6f2SShri Abhyankar 
1417cee9d6f2SShri Abhyankar        x[idx]   = s1;
1418cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1419cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1420cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1421cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1422cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1423cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1424cee9d6f2SShri Abhyankar     }
1425cee9d6f2SShri Abhyankar 
1426cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1427cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1428cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1429cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1430cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1431cee9d6f2SShri Abhyankar      idt = bs*i;
1432cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1433cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1434cee9d6f2SShri Abhyankar     while (nz--) {
1435cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
1436cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1437cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1438cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1439cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1440cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1441cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1442cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1443cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1444cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1445cee9d6f2SShri Abhyankar         v   +=  bs2;
1446cee9d6f2SShri Abhyankar     }
1447cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1448cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1449cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1450cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1451cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1452cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1453cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1454cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1455cee9d6f2SShri Abhyankar   }
1456cee9d6f2SShri Abhyankar 
1457cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1458cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1459cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1460cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1461cee9d6f2SShri Abhyankar }
1462cee9d6f2SShri Abhyankar 
1463cee9d6f2SShri Abhyankar #undef __FUNCT__
14644a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1465dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
146615091d37SBarry Smith {
146715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
146815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
14696849ba73SBarry Smith   PetscErrorCode    ierr;
14705d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
14715d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1472d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1473d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1474d9fead3dSBarry Smith   const PetscScalar *b;
147515091d37SBarry Smith   PetscFunctionBegin;
1476d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1478f1af5d2fSBarry Smith   t  = a->solve_work;
147915091d37SBarry Smith 
148015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
148115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
148215091d37SBarry Smith 
148315091d37SBarry Smith   /* forward solve the lower triangular */
148415091d37SBarry Smith   idx    = 6*(*r++);
1485f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1486f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1487f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
148815091d37SBarry Smith   for (i=1; i<n; i++) {
148915091d37SBarry Smith     v     = aa + 36*ai[i];
149015091d37SBarry Smith     vi    = aj + ai[i];
149115091d37SBarry Smith     nz    = diag[i] - ai[i];
149215091d37SBarry Smith     idx   = 6*(*r++);
1493f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1494f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
149515091d37SBarry Smith     while (nz--) {
149615091d37SBarry Smith       idx   = 6*(*vi++);
1497f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1498f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1499f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1500f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1501f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1502f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1503f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1504f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150515091d37SBarry Smith       v += 36;
150615091d37SBarry Smith     }
150715091d37SBarry Smith     idx = 6*i;
1508f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1509f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1510f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
151115091d37SBarry Smith   }
151215091d37SBarry Smith   /* backward solve the upper triangular */
151315091d37SBarry Smith   for (i=n-1; i>=0; i--){
151415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
151515091d37SBarry Smith     vi   = aj + diag[i] + 1;
151615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151715091d37SBarry Smith     idt  = 6*i;
1518f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1519f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1520f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
152115091d37SBarry Smith     while (nz--) {
152215091d37SBarry Smith       idx   = 6*(*vi++);
1523f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1524f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1525f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1526f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
153215091d37SBarry Smith       v += 36;
153315091d37SBarry Smith     }
153415091d37SBarry Smith     idc = 6*(*c--);
153515091d37SBarry Smith     v   = aa + 36*diag[i];
1536f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1537f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1538f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1539f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1540f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1541f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1542f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1543f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1544f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1545f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1546f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1547f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
154815091d37SBarry Smith   }
154915091d37SBarry Smith 
155015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
155115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1552d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1554dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
155515091d37SBarry Smith   PetscFunctionReturn(0);
155615091d37SBarry Smith }
155715091d37SBarry Smith 
15584a2ae208SSatish Balay #undef __FUNCT__
15594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1560dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
156115091d37SBarry Smith {
156215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1563690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1564dfbe8321SBarry Smith   PetscErrorCode    ierr;
1565690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1566d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1567d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1568d9fead3dSBarry Smith   const PetscScalar *b;
156915091d37SBarry Smith 
157015091d37SBarry Smith   PetscFunctionBegin;
1571d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15721ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
157315091d37SBarry Smith   /* forward solve the lower triangular */
157415091d37SBarry Smith   idx    = 0;
157515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
157615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
157715091d37SBarry Smith   for (i=1; i<n; i++) {
157815091d37SBarry Smith     v     =  aa + 36*ai[i];
157915091d37SBarry Smith     vi    =  aj + ai[i];
158015091d37SBarry Smith     nz    =  diag[i] - ai[i];
158115091d37SBarry Smith     idx   =  6*i;
1582f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1583f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
158415091d37SBarry Smith     while (nz--) {
158515091d37SBarry Smith       jdx   = 6*(*vi++);
158615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
158715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1588f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1589f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1590f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1591f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1592f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1593f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
159415091d37SBarry Smith       v += 36;
159515091d37SBarry Smith      }
1596f1af5d2fSBarry Smith     x[idx]   = s1;
1597f1af5d2fSBarry Smith     x[1+idx] = s2;
1598f1af5d2fSBarry Smith     x[2+idx] = s3;
1599f1af5d2fSBarry Smith     x[3+idx] = s4;
1600f1af5d2fSBarry Smith     x[4+idx] = s5;
1601f1af5d2fSBarry Smith     x[5+idx] = s6;
160215091d37SBarry Smith   }
160315091d37SBarry Smith   /* backward solve the upper triangular */
160415091d37SBarry Smith   for (i=n-1; i>=0; i--){
160515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
160615091d37SBarry Smith     vi   = aj + diag[i] + 1;
160715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
160815091d37SBarry Smith     idt  = 6*i;
1609f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1610f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1611f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
161215091d37SBarry Smith     while (nz--) {
161315091d37SBarry Smith       idx   = 6*(*vi++);
161415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
161515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1616f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1617f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1618f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1619f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1620f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1621f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
162215091d37SBarry Smith       v += 36;
162315091d37SBarry Smith     }
162415091d37SBarry Smith     v        = aa + 36*diag[i];
1625f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1626f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1627f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1628f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1629f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1630f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
163115091d37SBarry Smith   }
163215091d37SBarry Smith 
1633d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16341ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1635dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
163615091d37SBarry Smith   PetscFunctionReturn(0);
163715091d37SBarry Smith }
163815091d37SBarry Smith 
16394a2ae208SSatish Balay #undef __FUNCT__
1640cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1641cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1642cee9d6f2SShri Abhyankar {
1643cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1644cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1645cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1646cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1647cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1648cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1649cee9d6f2SShri Abhyankar     PetscScalar       *x;
1650cee9d6f2SShri Abhyankar     const PetscScalar *b;
1651cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1652cee9d6f2SShri Abhyankar 
1653cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1654cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1655cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1656cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1657cee9d6f2SShri Abhyankar     idx    = 0;
1658cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1659cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
1660cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1661cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1662cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1663cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1664cee9d6f2SShri Abhyankar       idx   = bs*i;
1665cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1666cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
1667cee9d6f2SShri Abhyankar        while (nz--) {
1668cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
1669cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1670cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1671cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1672cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1673cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1674cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1675cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1676cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1677cee9d6f2SShri Abhyankar           v   +=  bs2;
1678cee9d6f2SShri Abhyankar         }
1679cee9d6f2SShri Abhyankar 
1680cee9d6f2SShri Abhyankar        x[idx]   = s1;
1681cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1682cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1683cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1684cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1685cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1686cee9d6f2SShri Abhyankar     }
1687cee9d6f2SShri Abhyankar 
1688cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1689cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1690cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1691cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1692cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1693cee9d6f2SShri Abhyankar      idt = bs*i;
1694cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1695cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
1696cee9d6f2SShri Abhyankar     while (nz--) {
1697cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
1698cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1699cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
1700cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1701cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1702cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1703cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1704cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1705cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1706cee9d6f2SShri Abhyankar         v   +=  bs2;
1707cee9d6f2SShri Abhyankar     }
1708cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1709cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1710cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1711cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1712cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1713cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1714cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1715cee9d6f2SShri Abhyankar   }
1716cee9d6f2SShri Abhyankar 
1717cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1718cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1719cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1720cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1721cee9d6f2SShri Abhyankar }
1722cee9d6f2SShri Abhyankar #undef __FUNCT__
17234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1724dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
17254e2b4712SSatish Balay {
17264e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17274e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17286849ba73SBarry Smith   PetscErrorCode    ierr;
17295d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
17305d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1731d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1732d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1733d9fead3dSBarry Smith   const PetscScalar *b;
17344e2b4712SSatish Balay 
17354e2b4712SSatish Balay   PetscFunctionBegin;
1736d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1738f1af5d2fSBarry Smith   t  = a->solve_work;
17394e2b4712SSatish Balay 
17404e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17414e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17424e2b4712SSatish Balay 
17434e2b4712SSatish Balay   /* forward solve the lower triangular */
17444e2b4712SSatish Balay   idx    = 5*(*r++);
1745f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1746f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
17474e2b4712SSatish Balay   for (i=1; i<n; i++) {
17484e2b4712SSatish Balay     v     = aa + 25*ai[i];
17494e2b4712SSatish Balay     vi    = aj + ai[i];
17504e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17514e2b4712SSatish Balay     idx   = 5*(*r++);
1752f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1753f1af5d2fSBarry Smith     s5  = b[4+idx];
17544e2b4712SSatish Balay     while (nz--) {
17554e2b4712SSatish Balay       idx   = 5*(*vi++);
1756f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1757f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1758f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1759f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1760f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1761f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1762f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
17634e2b4712SSatish Balay       v += 25;
17644e2b4712SSatish Balay     }
17654e2b4712SSatish Balay     idx = 5*i;
1766f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1767f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
17684e2b4712SSatish Balay   }
17694e2b4712SSatish Balay   /* backward solve the upper triangular */
17704e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17714e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
17724e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17734e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17744e2b4712SSatish Balay     idt  = 5*i;
1775f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1776f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
17774e2b4712SSatish Balay     while (nz--) {
17784e2b4712SSatish Balay       idx   = 5*(*vi++);
1779f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1780f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1781f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1782f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1783f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1784f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1785f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
17864e2b4712SSatish Balay       v += 25;
17874e2b4712SSatish Balay     }
17884e2b4712SSatish Balay     idc = 5*(*c--);
17894e2b4712SSatish Balay     v   = aa + 25*diag[i];
1790f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1791f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1792f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1793f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1794f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1795f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1796f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1797f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1798f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1799f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
18004e2b4712SSatish Balay   }
18014e2b4712SSatish Balay 
18024e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18034e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1804d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1806dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
18074e2b4712SSatish Balay   PetscFunctionReturn(0);
18084e2b4712SSatish Balay }
18094e2b4712SSatish Balay 
18104a2ae208SSatish Balay #undef __FUNCT__
18114a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1812dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
181315091d37SBarry Smith {
181415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1815690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1816dfbe8321SBarry Smith   PetscErrorCode    ierr;
1817690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1818d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1819d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1820d9fead3dSBarry Smith   const PetscScalar *b;
182115091d37SBarry Smith 
182215091d37SBarry Smith   PetscFunctionBegin;
1823d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
182515091d37SBarry Smith   /* forward solve the lower triangular */
182615091d37SBarry Smith   idx    = 0;
182715091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
182815091d37SBarry Smith   for (i=1; i<n; i++) {
182915091d37SBarry Smith     v     =  aa + 25*ai[i];
183015091d37SBarry Smith     vi    =  aj + ai[i];
183115091d37SBarry Smith     nz    =  diag[i] - ai[i];
183215091d37SBarry Smith     idx   =  5*i;
1833f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
183415091d37SBarry Smith     while (nz--) {
183515091d37SBarry Smith       jdx   = 5*(*vi++);
183615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1837f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1838f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1839f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1840f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1841f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
184215091d37SBarry Smith       v    += 25;
184315091d37SBarry Smith     }
1844f1af5d2fSBarry Smith     x[idx]   = s1;
1845f1af5d2fSBarry Smith     x[1+idx] = s2;
1846f1af5d2fSBarry Smith     x[2+idx] = s3;
1847f1af5d2fSBarry Smith     x[3+idx] = s4;
1848f1af5d2fSBarry Smith     x[4+idx] = s5;
184915091d37SBarry Smith   }
185015091d37SBarry Smith   /* backward solve the upper triangular */
185115091d37SBarry Smith   for (i=n-1; i>=0; i--){
185215091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
185315091d37SBarry Smith     vi   = aj + diag[i] + 1;
185415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
185515091d37SBarry Smith     idt  = 5*i;
1856f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1857f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
185815091d37SBarry Smith     while (nz--) {
185915091d37SBarry Smith       idx   = 5*(*vi++);
186015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1861f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1862f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1863f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1864f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1865f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
186615091d37SBarry Smith       v    += 25;
186715091d37SBarry Smith     }
186815091d37SBarry Smith     v        = aa + 25*diag[i];
1869f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1870f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1871f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1872f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1873f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
187415091d37SBarry Smith   }
187515091d37SBarry Smith 
1876d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18771ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1878dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
187915091d37SBarry Smith   PetscFunctionReturn(0);
188015091d37SBarry Smith }
188115091d37SBarry Smith 
18824a2ae208SSatish Balay #undef __FUNCT__
1883cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
1884cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1885cee9d6f2SShri Abhyankar {
1886cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1887cee9d6f2SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1888cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
1889cee9d6f2SShri Abhyankar   PetscInt          jdx;
1890cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1891cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1892cee9d6f2SShri Abhyankar   const PetscScalar *b;
1893cee9d6f2SShri Abhyankar 
1894cee9d6f2SShri Abhyankar   PetscFunctionBegin;
1895cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1896cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1897cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
1898cee9d6f2SShri Abhyankar   idx    = 0;
1899cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1900cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
1901cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
1902cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
1903cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
1904cee9d6f2SShri Abhyankar     idx = 5*i;
1905cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1906cee9d6f2SShri Abhyankar     while (nz--) {
1907cee9d6f2SShri Abhyankar       jdx   = 5*(*vi++);
1908cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1909cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1910cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1911cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1912cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1913cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1914cee9d6f2SShri Abhyankar       v    += 25;
1915cee9d6f2SShri Abhyankar     }
1916cee9d6f2SShri Abhyankar     x[idx]   = s1;
1917cee9d6f2SShri Abhyankar     x[1+idx] = s2;
1918cee9d6f2SShri Abhyankar     x[2+idx] = s3;
1919cee9d6f2SShri Abhyankar     x[3+idx] = s4;
1920cee9d6f2SShri Abhyankar     x[4+idx] = s5;
1921cee9d6f2SShri Abhyankar   }
1922cee9d6f2SShri Abhyankar 
1923cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
1924cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1925cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
1926cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
1927cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1928cee9d6f2SShri Abhyankar     idt = 5*i;
1929cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
1930cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1931cee9d6f2SShri Abhyankar     while (nz--) {
1932cee9d6f2SShri Abhyankar       idx   = 5*(*vi++);
1933cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1934cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1935cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1936cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1937cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1938cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1939cee9d6f2SShri Abhyankar       v    += 25;
1940cee9d6f2SShri Abhyankar     }
1941cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1942cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1943cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1944cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1945cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1946cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1947cee9d6f2SShri Abhyankar   }
1948cee9d6f2SShri Abhyankar 
1949cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1950cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1952cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1953cee9d6f2SShri Abhyankar }
1954cee9d6f2SShri Abhyankar 
1955cee9d6f2SShri Abhyankar #undef __FUNCT__
19564a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1957dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
19584e2b4712SSatish Balay {
19594e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
19604e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
19616849ba73SBarry Smith   PetscErrorCode    ierr;
19625d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
19635d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1964d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1965d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1966d9fead3dSBarry Smith   const PetscScalar *b;
19674e2b4712SSatish Balay 
19684e2b4712SSatish Balay   PetscFunctionBegin;
1969d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1971f1af5d2fSBarry Smith   t  = a->solve_work;
19724e2b4712SSatish Balay 
19734e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
19744e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
19754e2b4712SSatish Balay 
19764e2b4712SSatish Balay   /* forward solve the lower triangular */
19774e2b4712SSatish Balay   idx    = 4*(*r++);
1978f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1979f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
19804e2b4712SSatish Balay   for (i=1; i<n; i++) {
19814e2b4712SSatish Balay     v     = aa + 16*ai[i];
19824e2b4712SSatish Balay     vi    = aj + ai[i];
19834e2b4712SSatish Balay     nz    = diag[i] - ai[i];
19844e2b4712SSatish Balay     idx   = 4*(*r++);
1985f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
19864e2b4712SSatish Balay     while (nz--) {
19874e2b4712SSatish Balay       idx   = 4*(*vi++);
1988f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1989f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1990f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1991f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1992f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
19934e2b4712SSatish Balay       v    += 16;
19944e2b4712SSatish Balay     }
19954e2b4712SSatish Balay     idx        = 4*i;
1996f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1997f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
19984e2b4712SSatish Balay   }
19994e2b4712SSatish Balay   /* backward solve the upper triangular */
20004e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20014e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
20024e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
20034e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
20044e2b4712SSatish Balay     idt  = 4*i;
2005f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2006f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
20074e2b4712SSatish Balay     while (nz--) {
20084e2b4712SSatish Balay       idx   = 4*(*vi++);
2009f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2010f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2011f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2012f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2013f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2014f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
20154e2b4712SSatish Balay       v += 16;
20164e2b4712SSatish Balay     }
20174e2b4712SSatish Balay     idc      = 4*(*c--);
20184e2b4712SSatish Balay     v        = aa + 16*diag[i];
2019f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2020f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2021f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2022f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
20234e2b4712SSatish Balay   }
20244e2b4712SSatish Balay 
20254e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
20264e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2027d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20281ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2029dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
20304e2b4712SSatish Balay   PetscFunctionReturn(0);
20314e2b4712SSatish Balay }
2032f26ec98cSKris Buschelman 
2033f26ec98cSKris Buschelman #undef __FUNCT__
2034f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2035dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2036f26ec98cSKris Buschelman {
2037f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2038f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
20396849ba73SBarry Smith   PetscErrorCode    ierr;
20405d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
20415d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2042d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2043d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2044d9fead3dSBarry Smith   PetscScalar       *x;
2045d9fead3dSBarry Smith   const PetscScalar *b;
2046f26ec98cSKris Buschelman 
2047f26ec98cSKris Buschelman   PetscFunctionBegin;
2048d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20491ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2050f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2051f26ec98cSKris Buschelman 
2052f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2053f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2054f26ec98cSKris Buschelman 
2055f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2056f26ec98cSKris Buschelman   idx    = 4*(*r++);
2057f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2058f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2059f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2060f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2061f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2062f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2063f26ec98cSKris Buschelman     vi    = aj + ai[i];
2064f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2065f26ec98cSKris Buschelman     idx   = 4*(*r++);
2066f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2067f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2068f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2069f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2070f26ec98cSKris Buschelman     while (nz--) {
2071f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2072f26ec98cSKris Buschelman       x1  = t[idx];
2073f26ec98cSKris Buschelman       x2  = t[1+idx];
2074f26ec98cSKris Buschelman       x3  = t[2+idx];
2075f26ec98cSKris Buschelman       x4  = t[3+idx];
2076f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2077f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2078f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2079f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2080f26ec98cSKris Buschelman       v    += 16;
2081f26ec98cSKris Buschelman     }
2082f26ec98cSKris Buschelman     idx        = 4*i;
2083f26ec98cSKris Buschelman     t[idx]   = s1;
2084f26ec98cSKris Buschelman     t[1+idx] = s2;
2085f26ec98cSKris Buschelman     t[2+idx] = s3;
2086f26ec98cSKris Buschelman     t[3+idx] = s4;
2087f26ec98cSKris Buschelman   }
2088f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2089f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2090f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2091f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2092f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2093f26ec98cSKris Buschelman     idt  = 4*i;
2094f26ec98cSKris Buschelman     s1 = t[idt];
2095f26ec98cSKris Buschelman     s2 = t[1+idt];
2096f26ec98cSKris Buschelman     s3 = t[2+idt];
2097f26ec98cSKris Buschelman     s4 = t[3+idt];
2098f26ec98cSKris Buschelman     while (nz--) {
2099f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2100f26ec98cSKris Buschelman       x1  = t[idx];
2101f26ec98cSKris Buschelman       x2  = t[1+idx];
2102f26ec98cSKris Buschelman       x3  = t[2+idx];
2103f26ec98cSKris Buschelman       x4  = t[3+idx];
2104f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2105f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2106f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2107f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2108f26ec98cSKris Buschelman       v += 16;
2109f26ec98cSKris Buschelman     }
2110f26ec98cSKris Buschelman     idc      = 4*(*c--);
2111f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2112f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2113f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2114f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2115f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2116f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2117f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2118f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2119f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2120f26ec98cSKris Buschelman  }
2121f26ec98cSKris Buschelman 
2122f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2123f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2124d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21251ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2126dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2127f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2128f26ec98cSKris Buschelman }
2129f26ec98cSKris Buschelman 
213024c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
213124c233c2SKris Buschelman 
213224c233c2SKris Buschelman #include PETSC_HAVE_SSE
213324c233c2SKris Buschelman 
213424c233c2SKris Buschelman #undef __FUNCT__
213524c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2136dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
213724c233c2SKris Buschelman {
213824c233c2SKris Buschelman   /*
213924c233c2SKris Buschelman      Note: This code uses demotion of double
214024c233c2SKris Buschelman      to float when performing the mixed-mode computation.
214124c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
214224c233c2SKris Buschelman   */
214324c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
214424c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
21456849ba73SBarry Smith   PetscErrorCode ierr;
21465d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
21475d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
214824c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
214987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
215024c233c2SKris Buschelman 
215124c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
215224c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
215324c233c2SKris Buschelman   unsigned long   offset;
215424c233c2SKris Buschelman 
215524c233c2SKris Buschelman   PetscFunctionBegin;
215624c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
215724c233c2SKris Buschelman 
215824c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
215924c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
216024c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
216124c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
216224c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
216324c233c2SKris Buschelman 
21641ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
21651ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
216624c233c2SKris Buschelman     t  = a->solve_work;
216724c233c2SKris Buschelman 
216824c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
216924c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
217024c233c2SKris Buschelman 
217124c233c2SKris Buschelman     /* forward solve the lower triangular */
217224c233c2SKris Buschelman     idx  = 4*(*r++);
217324c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
217424c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
217524c233c2SKris Buschelman     v    =  aa + 16*ai[1];
217624c233c2SKris Buschelman 
217724c233c2SKris Buschelman     for (i=1; i<n;) {
217824c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
217924c233c2SKris Buschelman       vi   =  aj      + ai[i];
218024c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
218124c233c2SKris Buschelman       idx  =  4*(*r++);
218224c233c2SKris Buschelman 
218324c233c2SKris Buschelman       /* Demote sum from double to float */
218424c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
218524c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
218624c233c2SKris Buschelman 
218724c233c2SKris Buschelman       while (nz--) {
218824c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
218924c233c2SKris Buschelman         idx = 4*(*vi++);
219024c233c2SKris Buschelman 
219124c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
219224c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
219324c233c2SKris Buschelman 
219424c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
219524c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
219624c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
219724c233c2SKris Buschelman 
219824c233c2SKris Buschelman           /* First Column */
219924c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
220024c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
220124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
220224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
220324c233c2SKris Buschelman 
220424c233c2SKris Buschelman           /* Second Column */
220524c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
220624c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
220724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
220824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
220924c233c2SKris Buschelman 
221024c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
221124c233c2SKris Buschelman 
221224c233c2SKris Buschelman           /* Third Column */
221324c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
221424c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
221524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
221624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
221724c233c2SKris Buschelman 
221824c233c2SKris Buschelman           /* Fourth Column */
221924c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
222024c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
222124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
222224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
222324c233c2SKris Buschelman         SSE_INLINE_END_2
222424c233c2SKris Buschelman 
222524c233c2SKris Buschelman         v  += 16;
222624c233c2SKris Buschelman       }
222724c233c2SKris Buschelman       idx = 4*i;
222824c233c2SKris Buschelman       v   = aa + 16*ai[++i];
222924c233c2SKris Buschelman       PREFETCH_NTA(v);
223024c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
223124c233c2SKris Buschelman 
223224c233c2SKris Buschelman       /* Promote result from float to double */
223324c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
223424c233c2SKris Buschelman     }
223524c233c2SKris Buschelman     /* backward solve the upper triangular */
223624c233c2SKris Buschelman     idt  = 4*(n-1);
223724c233c2SKris Buschelman     ai16 = 16*diag[n-1];
223824c233c2SKris Buschelman     v    = aa + ai16 + 16;
223924c233c2SKris Buschelman     for (i=n-1; i>=0;){
224024c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
224124c233c2SKris Buschelman       vi = aj + diag[i] + 1;
224224c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
224324c233c2SKris Buschelman 
224424c233c2SKris Buschelman       /* Demote accumulator from double to float */
224524c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
224624c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
224724c233c2SKris Buschelman 
224824c233c2SKris Buschelman       while (nz--) {
224924c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
225024c233c2SKris Buschelman         idx = 4*(*vi++);
225124c233c2SKris Buschelman 
225224c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
225324c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
225424c233c2SKris Buschelman 
225524c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
225624c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
225724c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
225824c233c2SKris Buschelman 
225924c233c2SKris Buschelman           /* First Column */
226024c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
226124c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
226224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
226324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
226424c233c2SKris Buschelman 
226524c233c2SKris Buschelman           /* Second Column */
226624c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
226724c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
226824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
226924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
227024c233c2SKris Buschelman 
227124c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
227224c233c2SKris Buschelman 
227324c233c2SKris Buschelman           /* Third Column */
227424c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
227524c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
227624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
227724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
227824c233c2SKris Buschelman 
227924c233c2SKris Buschelman           /* Fourth Column */
228024c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
228124c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
228224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
228324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
228424c233c2SKris Buschelman         SSE_INLINE_END_2
228524c233c2SKris Buschelman         v  += 16;
228624c233c2SKris Buschelman       }
228724c233c2SKris Buschelman       v    = aa + ai16;
228824c233c2SKris Buschelman       ai16 = 16*diag[--i];
228924c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
229024c233c2SKris Buschelman       /*
229124c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
229224c233c2SKris Buschelman          which was inverted as part of the factorization
229324c233c2SKris Buschelman       */
229424c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
229524c233c2SKris Buschelman         /* First Column */
229624c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
229724c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
229824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
229924c233c2SKris Buschelman 
230024c233c2SKris Buschelman         /* Second Column */
230124c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
230224c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
230324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
230424c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
230524c233c2SKris Buschelman 
230624c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
230724c233c2SKris Buschelman 
230824c233c2SKris Buschelman         /* Third Column */
230924c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
231024c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
231124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
231224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
231324c233c2SKris Buschelman 
231424c233c2SKris Buschelman         /* Fourth Column */
231524c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
231624c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
231724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
231824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
231924c233c2SKris Buschelman 
232024c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
232124c233c2SKris Buschelman       SSE_INLINE_END_3
232224c233c2SKris Buschelman 
232324c233c2SKris Buschelman       /* Promote solution from float to double */
232424c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
232524c233c2SKris Buschelman 
232624c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
232724c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
232824c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
232924c233c2SKris Buschelman       idc  = 4*(*c--);
233024c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
233124c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
233224c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
233324c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
233424c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
233524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
233624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
233724c233c2SKris Buschelman       SSE_INLINE_END_2
233824c233c2SKris Buschelman       v    = aa + ai16 + 16;
233924c233c2SKris Buschelman       idt -= 4;
234024c233c2SKris Buschelman     }
234124c233c2SKris Buschelman 
234224c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
234324c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23441ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
23451ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2346dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
234724c233c2SKris Buschelman   SSE_SCOPE_END;
234824c233c2SKris Buschelman   PetscFunctionReturn(0);
234924c233c2SKris Buschelman }
235024c233c2SKris Buschelman 
235124c233c2SKris Buschelman #endif
23520ef38995SBarry Smith 
23530ef38995SBarry Smith 
23544e2b4712SSatish Balay /*
23554e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
23564e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
23574e2b4712SSatish Balay */
23584a2ae208SSatish Balay #undef __FUNCT__
23594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2360dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
23614e2b4712SSatish Balay {
23624e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2363356650c2SBarry Smith   PetscInt          n=a->mbs;
2364356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2365dfbe8321SBarry Smith   PetscErrorCode    ierr;
2366356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2367d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2368d9fead3dSBarry Smith   PetscScalar       *x;
2369d9fead3dSBarry Smith   const PetscScalar *b;
23704e2b4712SSatish Balay 
23714e2b4712SSatish Balay   PetscFunctionBegin;
2372d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23744e2b4712SSatish Balay 
2375aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
23762853dc0eSBarry Smith   {
237787828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
23782853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
23792853dc0eSBarry Smith   }
2380aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
23812853dc0eSBarry Smith   {
238287828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
23832853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
23842853dc0eSBarry Smith   }
2385aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
23862853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2387e1293385SBarry Smith #else
238830d4dcafSBarry Smith   {
238987828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2390d9fead3dSBarry Smith     const MatScalar *v;
2391356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2392356650c2SBarry Smith     const PetscInt  *vi;
2393e1293385SBarry Smith 
23944e2b4712SSatish Balay   /* forward solve the lower triangular */
23954e2b4712SSatish Balay   idx    = 0;
2396e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
23974e2b4712SSatish Balay   for (i=1; i<n; i++) {
23984e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
23994e2b4712SSatish Balay     vi    =  aj      + ai[i];
24004e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2401e1293385SBarry Smith     idx   +=  4;
2402f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
24034e2b4712SSatish Balay     while (nz--) {
24044e2b4712SSatish Balay       jdx   = 4*(*vi++);
24054e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2406f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2407f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2408f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2409f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
24104e2b4712SSatish Balay       v    += 16;
24114e2b4712SSatish Balay     }
2412f1af5d2fSBarry Smith     x[idx]   = s1;
2413f1af5d2fSBarry Smith     x[1+idx] = s2;
2414f1af5d2fSBarry Smith     x[2+idx] = s3;
2415f1af5d2fSBarry Smith     x[3+idx] = s4;
24164e2b4712SSatish Balay   }
24174e2b4712SSatish Balay   /* backward solve the upper triangular */
24184e555682SBarry Smith   idt = 4*(n-1);
24194e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24204e555682SBarry Smith     ai16 = 16*diag[i];
24214e555682SBarry Smith     v    = aa + ai16 + 16;
24224e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24234e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2424f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2425f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
24264e2b4712SSatish Balay     while (nz--) {
24274e2b4712SSatish Balay       idx   = 4*(*vi++);
24284e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2429f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2430f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2431f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2432f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
24334e2b4712SSatish Balay       v    += 16;
24344e2b4712SSatish Balay     }
24354e555682SBarry Smith     v        = aa + ai16;
2436f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2437f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2438f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2439f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2440329f5518SBarry Smith     idt -= 4;
24414e2b4712SSatish Balay   }
244230d4dcafSBarry Smith   }
2443e1293385SBarry Smith #endif
24444e2b4712SSatish Balay 
2445d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24461ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2447dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
24484e2b4712SSatish Balay   PetscFunctionReturn(0);
24494e2b4712SSatish Balay }
24504e2b4712SSatish Balay 
2451f26ec98cSKris Buschelman #undef __FUNCT__
2452cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2453cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2454cee9d6f2SShri Abhyankar {
2455cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2456cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2457cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2458cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2459cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2460cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2461cee9d6f2SShri Abhyankar     PetscScalar       *x;
2462cee9d6f2SShri Abhyankar     const PetscScalar *b;
2463cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2464cee9d6f2SShri Abhyankar 
2465cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2466cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2467cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2468cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
2469cee9d6f2SShri Abhyankar     idx    = 0;
2470cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2471cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2472cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2473cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2474cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
2475cee9d6f2SShri Abhyankar       idx   = bs*i;
2476cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2477cee9d6f2SShri Abhyankar        while (nz--) {
2478cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
2479cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2480cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2481cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2482cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2483cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2484cee9d6f2SShri Abhyankar 
2485cee9d6f2SShri Abhyankar           v   +=  bs2;
2486cee9d6f2SShri Abhyankar         }
2487cee9d6f2SShri Abhyankar 
2488cee9d6f2SShri Abhyankar        x[idx]   = s1;
2489cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2490cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2491cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2492cee9d6f2SShri Abhyankar     }
2493cee9d6f2SShri Abhyankar 
2494cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
2495cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2496cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2497cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2498cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2499cee9d6f2SShri Abhyankar      idt = bs*i;
2500cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2501cee9d6f2SShri Abhyankar 
2502cee9d6f2SShri Abhyankar     while (nz--) {
2503cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
2504cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2505cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2506cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2507cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2508cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2509cee9d6f2SShri Abhyankar 
2510cee9d6f2SShri Abhyankar         v   +=  bs2;
2511cee9d6f2SShri Abhyankar     }
2512cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2513cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2514cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2515cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2516cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2517cee9d6f2SShri Abhyankar 
2518cee9d6f2SShri Abhyankar   }
2519cee9d6f2SShri Abhyankar 
2520cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2521cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2522cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2523cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2524cee9d6f2SShri Abhyankar }
2525cee9d6f2SShri Abhyankar 
2526cee9d6f2SShri Abhyankar 
2527cee9d6f2SShri Abhyankar 
2528cee9d6f2SShri Abhyankar #undef __FUNCT__
2529f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2530dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2531f26ec98cSKris Buschelman {
2532f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2533690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2534dfbe8321SBarry Smith   PetscErrorCode ierr;
2535690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2536f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2537f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2538f26ec98cSKris Buschelman 
2539f26ec98cSKris Buschelman   PetscFunctionBegin;
25401ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2542f26ec98cSKris Buschelman 
2543f26ec98cSKris Buschelman   {
2544f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2545f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2546690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2547f26ec98cSKris Buschelman 
2548f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2549f26ec98cSKris Buschelman     idx  = 0;
2550f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2551f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2552f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2553f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2554f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2555f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2556f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2557f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2558f26ec98cSKris Buschelman       idx   +=  4;
2559f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2560f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2561f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2562f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2563f26ec98cSKris Buschelman       while (nz--) {
2564f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2565f26ec98cSKris Buschelman         x1  = t[jdx];
2566f26ec98cSKris Buschelman         x2  = t[1+jdx];
2567f26ec98cSKris Buschelman         x3  = t[2+jdx];
2568f26ec98cSKris Buschelman         x4  = t[3+jdx];
2569f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2570f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2571f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2572f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2573f26ec98cSKris Buschelman         v    += 16;
2574f26ec98cSKris Buschelman       }
2575f26ec98cSKris Buschelman       t[idx]   = s1;
2576f26ec98cSKris Buschelman       t[1+idx] = s2;
2577f26ec98cSKris Buschelman       t[2+idx] = s3;
2578f26ec98cSKris Buschelman       t[3+idx] = s4;
2579f26ec98cSKris Buschelman     }
2580f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2581f26ec98cSKris Buschelman     idt = 4*(n-1);
2582f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2583f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2584f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2585f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2586f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2587f26ec98cSKris Buschelman       s1   = t[idt];
2588f26ec98cSKris Buschelman       s2   = t[1+idt];
2589f26ec98cSKris Buschelman       s3   = t[2+idt];
2590f26ec98cSKris Buschelman       s4   = t[3+idt];
2591f26ec98cSKris Buschelman       while (nz--) {
2592f26ec98cSKris Buschelman         idx = 4*(*vi++);
2593f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2594f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2595f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2596f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2597f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2598f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2599f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2600f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2601f26ec98cSKris Buschelman         v    += 16;
2602f26ec98cSKris Buschelman       }
2603f26ec98cSKris Buschelman       v        = aa + ai16;
2604f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2605f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2606f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2607f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2608f26ec98cSKris Buschelman       idt -= 4;
2609f26ec98cSKris Buschelman     }
2610f26ec98cSKris Buschelman   }
2611f26ec98cSKris Buschelman 
26121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2614dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2615f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2616f26ec98cSKris Buschelman }
2617f26ec98cSKris Buschelman 
26183660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
26193660e330SKris Buschelman 
26203660e330SKris Buschelman #include PETSC_HAVE_SSE
26213660e330SKris Buschelman #undef __FUNCT__
26227cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2623dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
26243660e330SKris Buschelman {
26253660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
26262aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2627dfbe8321SBarry Smith   PetscErrorCode ierr;
2628dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
26293660e330SKris Buschelman   MatScalar      *aa=a->a;
263087828ca2SBarry Smith   PetscScalar    *x,*b;
26313660e330SKris Buschelman 
26323660e330SKris Buschelman   PetscFunctionBegin;
26333660e330SKris Buschelman   SSE_SCOPE_BEGIN;
26343660e330SKris Buschelman   /*
26353660e330SKris Buschelman      Note: This code currently uses demotion of double
26363660e330SKris Buschelman      to float when performing the mixed-mode computation.
26373660e330SKris Buschelman      This may not be numerically reasonable for all applications.
26383660e330SKris Buschelman   */
26393660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
26403660e330SKris Buschelman 
26411ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26433660e330SKris Buschelman   {
2644eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2645eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
26462aa5897fSKris Buschelman     int            nz,i,idt,ai16;
26472aa5897fSKris Buschelman     unsigned int   jdx,idx;
26482aa5897fSKris Buschelman     unsigned short *vi;
2649eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
26503660e330SKris Buschelman 
2651eb05f457SKris Buschelman     /* First block is the identity. */
26523660e330SKris Buschelman     idx  = 0;
2653eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
26542aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
26553660e330SKris Buschelman 
26563660e330SKris Buschelman     for (i=1; i<n;) {
26573660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
26583660e330SKris Buschelman       vi   =  aj      + ai[i];
26593660e330SKris Buschelman       nz   =  diag[i] - ai[i];
26603660e330SKris Buschelman       idx +=  4;
26613660e330SKris Buschelman 
2662eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2663eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2664eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
26653660e330SKris Buschelman 
26663660e330SKris Buschelman       while (nz--) {
26673660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
26682aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
26693660e330SKris Buschelman 
26703660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2671eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
26723660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26733660e330SKris Buschelman 
26743660e330SKris Buschelman           /* First Column */
26753660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26763660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26773660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26783660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26793660e330SKris Buschelman 
26803660e330SKris Buschelman           /* Second Column */
26813660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26823660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26833660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26843660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26853660e330SKris Buschelman 
26863660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26873660e330SKris Buschelman 
26883660e330SKris Buschelman           /* Third Column */
26893660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26903660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26913660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26923660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26933660e330SKris Buschelman 
26943660e330SKris Buschelman           /* Fourth Column */
26953660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26963660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26973660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26983660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26993660e330SKris Buschelman         SSE_INLINE_END_2
27003660e330SKris Buschelman 
27013660e330SKris Buschelman         v  += 16;
27023660e330SKris Buschelman       }
27033660e330SKris Buschelman       v    =  aa + 16*ai[++i];
27043660e330SKris Buschelman       PREFETCH_NTA(v);
2705eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
27063660e330SKris Buschelman     }
2707eb05f457SKris Buschelman 
2708eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2709eb05f457SKris Buschelman 
27103660e330SKris Buschelman     idt  = 4*(n-1);
27113660e330SKris Buschelman     ai16 = 16*diag[n-1];
27123660e330SKris Buschelman     v    = aa + ai16 + 16;
27133660e330SKris Buschelman     for (i=n-1; i>=0;){
27143660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
27153660e330SKris Buschelman       vi = aj + diag[i] + 1;
27163660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
27173660e330SKris Buschelman 
2718eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
27193660e330SKris Buschelman 
27203660e330SKris Buschelman       while (nz--) {
27213660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
27222aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
27233660e330SKris Buschelman 
27243660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2725eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
27263660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
27273660e330SKris Buschelman 
27283660e330SKris Buschelman           /* First Column */
27293660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
27303660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
27313660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
27323660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
27333660e330SKris Buschelman 
27343660e330SKris Buschelman           /* Second Column */
27353660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
27363660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
27373660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
27383660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
27393660e330SKris Buschelman 
27403660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
27413660e330SKris Buschelman 
27423660e330SKris Buschelman           /* Third Column */
27433660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
27443660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
27453660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
27463660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
27473660e330SKris Buschelman 
27483660e330SKris Buschelman           /* Fourth Column */
27493660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
27503660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
27513660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
27523660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
27533660e330SKris Buschelman         SSE_INLINE_END_2
27543660e330SKris Buschelman         v  += 16;
27553660e330SKris Buschelman       }
27563660e330SKris Buschelman       v    = aa + ai16;
27573660e330SKris Buschelman       ai16 = 16*diag[--i];
27583660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
27593660e330SKris Buschelman       /*
27603660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
27613660e330SKris Buschelman          which was inverted as part of the factorization
27623660e330SKris Buschelman       */
2763eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
27643660e330SKris Buschelman         /* First Column */
27653660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
27663660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
27673660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
27683660e330SKris Buschelman 
27693660e330SKris Buschelman         /* Second Column */
27703660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
27713660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
27723660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
27733660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
27743660e330SKris Buschelman 
27753660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
27763660e330SKris Buschelman 
27773660e330SKris Buschelman         /* Third Column */
27783660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
27793660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
27803660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
27813660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
27823660e330SKris Buschelman 
27833660e330SKris Buschelman         /* Fourth Column */
27843660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
27853660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
27863660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
27873660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
27883660e330SKris Buschelman 
27893660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
27903660e330SKris Buschelman       SSE_INLINE_END_3
27913660e330SKris Buschelman 
27923660e330SKris Buschelman       v    = aa + ai16 + 16;
27933660e330SKris Buschelman       idt -= 4;
27943660e330SKris Buschelman     }
2795eb05f457SKris Buschelman 
2796eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2797eb05f457SKris Buschelman     idt = 4*(n-1);
2798eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2799eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2800eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2801eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2802eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2803eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2804eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2805eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2806eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
280754693613SKris Buschelman       idt -= 4;
28083660e330SKris Buschelman     }
2809eb05f457SKris Buschelman 
2810eb05f457SKris Buschelman   } /* End of artificial scope. */
28111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2813dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28143660e330SKris Buschelman   SSE_SCOPE_END;
28153660e330SKris Buschelman   PetscFunctionReturn(0);
28163660e330SKris Buschelman }
28173660e330SKris Buschelman 
28187cf1b8d3SKris Buschelman #undef __FUNCT__
28197cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2820dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
28217cf1b8d3SKris Buschelman {
28227cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
28237cf1b8d3SKris Buschelman   int            *aj=a->j;
2824dfbe8321SBarry Smith   PetscErrorCode ierr;
2825dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
28267cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
28277cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
28287cf1b8d3SKris Buschelman 
28297cf1b8d3SKris Buschelman   PetscFunctionBegin;
28307cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
28317cf1b8d3SKris Buschelman   /*
28327cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
28337cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
28347cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
28357cf1b8d3SKris Buschelman   */
28367cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
28377cf1b8d3SKris Buschelman 
28381ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
28391ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28407cf1b8d3SKris Buschelman   {
28417cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
28427cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
28437cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
28447cf1b8d3SKris Buschelman     int       jdx,idx;
28457cf1b8d3SKris Buschelman     int       *vi;
28467cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
28477cf1b8d3SKris Buschelman 
28487cf1b8d3SKris Buschelman     /* First block is the identity. */
28497cf1b8d3SKris Buschelman     idx  = 0;
28507cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
28517cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
28527cf1b8d3SKris Buschelman 
28537cf1b8d3SKris Buschelman     for (i=1; i<n;) {
28547cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
28557cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
28567cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
28577cf1b8d3SKris Buschelman       idx +=  4;
28587cf1b8d3SKris Buschelman 
28597cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
28607cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
28617cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
28627cf1b8d3SKris Buschelman 
28637cf1b8d3SKris Buschelman       while (nz--) {
28647cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
28657cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
28667cf1b8d3SKris Buschelman /*          jdx = *vi++; */
28677cf1b8d3SKris Buschelman 
28687cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
28697cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
28707cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
28717cf1b8d3SKris Buschelman 
28727cf1b8d3SKris Buschelman           /* First Column */
28737cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
28747cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
28757cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
28767cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
28777cf1b8d3SKris Buschelman 
28787cf1b8d3SKris Buschelman           /* Second Column */
28797cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
28807cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
28817cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
28827cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
28837cf1b8d3SKris Buschelman 
28847cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
28857cf1b8d3SKris Buschelman 
28867cf1b8d3SKris Buschelman           /* Third Column */
28877cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
28887cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
28897cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
28907cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
28917cf1b8d3SKris Buschelman 
28927cf1b8d3SKris Buschelman           /* Fourth Column */
28937cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
28947cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
28957cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
28967cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
28977cf1b8d3SKris Buschelman         SSE_INLINE_END_2
28987cf1b8d3SKris Buschelman 
28997cf1b8d3SKris Buschelman         v  += 16;
29007cf1b8d3SKris Buschelman       }
29017cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
29027cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
29037cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
29047cf1b8d3SKris Buschelman     }
29057cf1b8d3SKris Buschelman 
29067cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
29077cf1b8d3SKris Buschelman 
29087cf1b8d3SKris Buschelman     idt  = 4*(n-1);
29097cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
29107cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
29117cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
29127cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
29137cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
29147cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
29157cf1b8d3SKris Buschelman 
29167cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
29177cf1b8d3SKris Buschelman 
29187cf1b8d3SKris Buschelman       while (nz--) {
29197cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
29207cf1b8d3SKris Buschelman         idx = 4*(*vi++);
29217cf1b8d3SKris Buschelman /*          idx = *vi++; */
29227cf1b8d3SKris Buschelman 
29237cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
29247cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
29257cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
29267cf1b8d3SKris Buschelman 
29277cf1b8d3SKris Buschelman           /* First Column */
29287cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
29297cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
29307cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
29317cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
29327cf1b8d3SKris Buschelman 
29337cf1b8d3SKris Buschelman           /* Second Column */
29347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
29357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
29367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
29377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
29387cf1b8d3SKris Buschelman 
29397cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
29407cf1b8d3SKris Buschelman 
29417cf1b8d3SKris Buschelman           /* Third Column */
29427cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
29437cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
29447cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
29457cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
29467cf1b8d3SKris Buschelman 
29477cf1b8d3SKris Buschelman           /* Fourth Column */
29487cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
29497cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
29507cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
29517cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
29527cf1b8d3SKris Buschelman         SSE_INLINE_END_2
29537cf1b8d3SKris Buschelman         v  += 16;
29547cf1b8d3SKris Buschelman       }
29557cf1b8d3SKris Buschelman       v    = aa + ai16;
29567cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
29577cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
29587cf1b8d3SKris Buschelman       /*
29597cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
29607cf1b8d3SKris Buschelman          which was inverted as part of the factorization
29617cf1b8d3SKris Buschelman       */
29627cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
29637cf1b8d3SKris Buschelman         /* First Column */
29647cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
29657cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
29667cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
29677cf1b8d3SKris Buschelman 
29687cf1b8d3SKris Buschelman         /* Second Column */
29697cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
29707cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
29717cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
29727cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
29737cf1b8d3SKris Buschelman 
29747cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
29757cf1b8d3SKris Buschelman 
29767cf1b8d3SKris Buschelman         /* Third Column */
29777cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
29787cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
29797cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
29807cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
29817cf1b8d3SKris Buschelman 
29827cf1b8d3SKris Buschelman         /* Fourth Column */
29837cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
29847cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
29857cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
29867cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
29877cf1b8d3SKris Buschelman 
29887cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
29897cf1b8d3SKris Buschelman       SSE_INLINE_END_3
29907cf1b8d3SKris Buschelman 
29917cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
29927cf1b8d3SKris Buschelman       idt -= 4;
29937cf1b8d3SKris Buschelman     }
29947cf1b8d3SKris Buschelman 
29957cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
29967cf1b8d3SKris Buschelman     idt = 4*(n-1);
29977cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
29987cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
29997cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
30007cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
30017cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
30027cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
30037cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
30047cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
30057cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
30067cf1b8d3SKris Buschelman       idt -= 4;
30077cf1b8d3SKris Buschelman     }
30087cf1b8d3SKris Buschelman 
30097cf1b8d3SKris Buschelman   } /* End of artificial scope. */
30101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3012dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
30137cf1b8d3SKris Buschelman   SSE_SCOPE_END;
30147cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
30157cf1b8d3SKris Buschelman }
30167cf1b8d3SKris Buschelman 
30173660e330SKris Buschelman #endif
30184a2ae208SSatish Balay #undef __FUNCT__
30194a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3020dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
30214e2b4712SSatish Balay {
30224e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30234e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
30246849ba73SBarry Smith   PetscErrorCode    ierr;
30255d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
30265d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3027d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3028d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3029d9fead3dSBarry Smith   const PetscScalar *b;
30304e2b4712SSatish Balay 
30314e2b4712SSatish Balay   PetscFunctionBegin;
3032d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30331ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3034f1af5d2fSBarry Smith   t  = a->solve_work;
30354e2b4712SSatish Balay 
30364e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30374e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30384e2b4712SSatish Balay 
30394e2b4712SSatish Balay   /* forward solve the lower triangular */
30404e2b4712SSatish Balay   idx    = 3*(*r++);
3041f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
30424e2b4712SSatish Balay   for (i=1; i<n; i++) {
30434e2b4712SSatish Balay     v     = aa + 9*ai[i];
30444e2b4712SSatish Balay     vi    = aj + ai[i];
30454e2b4712SSatish Balay     nz    = diag[i] - ai[i];
30464e2b4712SSatish Balay     idx   = 3*(*r++);
3047f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
30484e2b4712SSatish Balay     while (nz--) {
30494e2b4712SSatish Balay       idx   = 3*(*vi++);
3050f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3051f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3052f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3053f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
30544e2b4712SSatish Balay       v += 9;
30554e2b4712SSatish Balay     }
30564e2b4712SSatish Balay     idx = 3*i;
3057f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
30584e2b4712SSatish Balay   }
30594e2b4712SSatish Balay   /* backward solve the upper triangular */
30604e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30614e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
30624e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30634e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
30644e2b4712SSatish Balay     idt  = 3*i;
3065f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
30664e2b4712SSatish Balay     while (nz--) {
30674e2b4712SSatish Balay       idx   = 3*(*vi++);
3068f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3069f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3070f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3071f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
30724e2b4712SSatish Balay       v += 9;
30734e2b4712SSatish Balay     }
30744e2b4712SSatish Balay     idc = 3*(*c--);
30754e2b4712SSatish Balay     v   = aa + 9*diag[i];
3076f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3077f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3078f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
30794e2b4712SSatish Balay   }
30804e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30814e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3082d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30831ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3084dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
30854e2b4712SSatish Balay   PetscFunctionReturn(0);
30864e2b4712SSatish Balay }
30874e2b4712SSatish Balay 
308815091d37SBarry Smith /*
308915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
309015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
309115091d37SBarry Smith */
30924a2ae208SSatish Balay #undef __FUNCT__
30934a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3094dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
309515091d37SBarry Smith {
309615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3097690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3098dfbe8321SBarry Smith   PetscErrorCode    ierr;
3099690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3100d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3101d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3102d9fead3dSBarry Smith   const PetscScalar *b;
3103690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
310415091d37SBarry Smith 
310515091d37SBarry Smith   PetscFunctionBegin;
3106d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31071ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
310815091d37SBarry Smith 
310915091d37SBarry Smith   /* forward solve the lower triangular */
311015091d37SBarry Smith   idx    = 0;
311115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
311215091d37SBarry Smith   for (i=1; i<n; i++) {
311315091d37SBarry Smith     v     =  aa      + 9*ai[i];
311415091d37SBarry Smith     vi    =  aj      + ai[i];
311515091d37SBarry Smith     nz    =  diag[i] - ai[i];
311615091d37SBarry Smith     idx   +=  3;
3117f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
311815091d37SBarry Smith     while (nz--) {
311915091d37SBarry Smith       jdx   = 3*(*vi++);
312015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3121f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3122f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3123f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
312415091d37SBarry Smith       v    += 9;
312515091d37SBarry Smith     }
3126f1af5d2fSBarry Smith     x[idx]   = s1;
3127f1af5d2fSBarry Smith     x[1+idx] = s2;
3128f1af5d2fSBarry Smith     x[2+idx] = s3;
312915091d37SBarry Smith   }
313015091d37SBarry Smith   /* backward solve the upper triangular */
313115091d37SBarry Smith   for (i=n-1; i>=0; i--){
313215091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
313315091d37SBarry Smith     vi   = aj + diag[i] + 1;
313415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
313515091d37SBarry Smith     idt  = 3*i;
3136f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3137f1af5d2fSBarry Smith     s3 = x[2+idt];
313815091d37SBarry Smith     while (nz--) {
313915091d37SBarry Smith       idx   = 3*(*vi++);
314015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3141f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3142f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3143f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
314415091d37SBarry Smith       v    += 9;
314515091d37SBarry Smith     }
314615091d37SBarry Smith     v        = aa +  9*diag[i];
3147f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3148f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3149f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
315015091d37SBarry Smith   }
315115091d37SBarry Smith 
3152d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3154dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
315515091d37SBarry Smith   PetscFunctionReturn(0);
315615091d37SBarry Smith }
315715091d37SBarry Smith 
31584a2ae208SSatish Balay #undef __FUNCT__
3159cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3160cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3161cee9d6f2SShri Abhyankar {
3162cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3163cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3164cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3165cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3166cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3167cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3168cee9d6f2SShri Abhyankar     PetscScalar       *x;
3169cee9d6f2SShri Abhyankar     const PetscScalar *b;
3170cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3171cee9d6f2SShri Abhyankar 
3172cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3173cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3174cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3175cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3176cee9d6f2SShri Abhyankar     idx    = 0;
3177cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3178cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3179cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3180cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3181cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3182cee9d6f2SShri Abhyankar       idx   = bs*i;
3183cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3184cee9d6f2SShri Abhyankar        while (nz--) {
3185cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
3186cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3187cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3188cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3189cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3190cee9d6f2SShri Abhyankar 
3191cee9d6f2SShri Abhyankar           v   +=  bs2;
3192cee9d6f2SShri Abhyankar         }
3193cee9d6f2SShri Abhyankar 
3194cee9d6f2SShri Abhyankar        x[idx]   = s1;
3195cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3196cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3197cee9d6f2SShri Abhyankar     }
3198cee9d6f2SShri Abhyankar 
3199cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3200cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3201cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3202cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3203cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3204cee9d6f2SShri Abhyankar      idt = bs*i;
3205cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3206cee9d6f2SShri Abhyankar 
3207cee9d6f2SShri Abhyankar     while (nz--) {
3208cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
3209cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3210cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3211cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3212cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3213cee9d6f2SShri Abhyankar 
3214cee9d6f2SShri Abhyankar         v   +=  bs2;
3215cee9d6f2SShri Abhyankar     }
3216cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3217cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3218cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3219cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3220cee9d6f2SShri Abhyankar 
3221cee9d6f2SShri Abhyankar   }
3222cee9d6f2SShri Abhyankar 
3223cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3224cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3225cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3226cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3227cee9d6f2SShri Abhyankar }
3228cee9d6f2SShri Abhyankar 
3229cee9d6f2SShri Abhyankar #undef __FUNCT__
32304a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3231dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
32324e2b4712SSatish Balay {
32334e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
32344e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
32356849ba73SBarry Smith   PetscErrorCode    ierr;
32365d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
32375d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3238d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3239d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
3240d9fead3dSBarry Smith   const PetscScalar *b;
32414e2b4712SSatish Balay 
32424e2b4712SSatish Balay   PetscFunctionBegin;
3243d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32441ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3245f1af5d2fSBarry Smith   t  = a->solve_work;
32464e2b4712SSatish Balay 
32474e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
32484e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
32494e2b4712SSatish Balay 
32504e2b4712SSatish Balay   /* forward solve the lower triangular */
32514e2b4712SSatish Balay   idx    = 2*(*r++);
3252f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
32534e2b4712SSatish Balay   for (i=1; i<n; i++) {
32544e2b4712SSatish Balay     v     = aa + 4*ai[i];
32554e2b4712SSatish Balay     vi    = aj + ai[i];
32564e2b4712SSatish Balay     nz    = diag[i] - ai[i];
32574e2b4712SSatish Balay     idx   = 2*(*r++);
3258f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
32594e2b4712SSatish Balay     while (nz--) {
32604e2b4712SSatish Balay       idx   = 2*(*vi++);
3261f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3262f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3263f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
32644e2b4712SSatish Balay       v += 4;
32654e2b4712SSatish Balay     }
32664e2b4712SSatish Balay     idx = 2*i;
3267f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
32684e2b4712SSatish Balay   }
32694e2b4712SSatish Balay   /* backward solve the upper triangular */
32704e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
32714e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
32724e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
32734e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
32744e2b4712SSatish Balay     idt  = 2*i;
3275f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
32764e2b4712SSatish Balay     while (nz--) {
32774e2b4712SSatish Balay       idx   = 2*(*vi++);
3278f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3279f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3280f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
32814e2b4712SSatish Balay       v += 4;
32824e2b4712SSatish Balay     }
32834e2b4712SSatish Balay     idc = 2*(*c--);
32844e2b4712SSatish Balay     v   = aa + 4*diag[i];
3285f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3286f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
32874e2b4712SSatish Balay   }
32884e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
32894e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3290d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32911ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3292dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
32934e2b4712SSatish Balay   PetscFunctionReturn(0);
32944e2b4712SSatish Balay }
32954e2b4712SSatish Balay 
329615091d37SBarry Smith /*
329715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
329815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
329915091d37SBarry Smith */
33004a2ae208SSatish Balay #undef __FUNCT__
33014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3302dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
330315091d37SBarry Smith {
330415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3305690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3306dfbe8321SBarry Smith   PetscErrorCode    ierr;
3307690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3308d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3309d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
3310d9fead3dSBarry Smith   const PetscScalar *b;
3311690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
331215091d37SBarry Smith 
331315091d37SBarry Smith   PetscFunctionBegin;
3314d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
331615091d37SBarry Smith 
331715091d37SBarry Smith   /* forward solve the lower triangular */
331815091d37SBarry Smith   idx    = 0;
331915091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
332015091d37SBarry Smith   for (i=1; i<n; i++) {
332115091d37SBarry Smith     v     =  aa      + 4*ai[i];
332215091d37SBarry Smith     vi    =  aj      + ai[i];
332315091d37SBarry Smith     nz    =  diag[i] - ai[i];
332415091d37SBarry Smith     idx   +=  2;
3325f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
332615091d37SBarry Smith     while (nz--) {
332715091d37SBarry Smith       jdx   = 2*(*vi++);
332815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3329f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3330f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
333115091d37SBarry Smith       v    += 4;
333215091d37SBarry Smith     }
3333f1af5d2fSBarry Smith     x[idx]   = s1;
3334f1af5d2fSBarry Smith     x[1+idx] = s2;
333515091d37SBarry Smith   }
333615091d37SBarry Smith   /* backward solve the upper triangular */
333715091d37SBarry Smith   for (i=n-1; i>=0; i--){
333815091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
333915091d37SBarry Smith     vi   = aj + diag[i] + 1;
334015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
334115091d37SBarry Smith     idt  = 2*i;
3342f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
334315091d37SBarry Smith     while (nz--) {
334415091d37SBarry Smith       idx   = 2*(*vi++);
334515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3346f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3347f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
334815091d37SBarry Smith       v    += 4;
334915091d37SBarry Smith     }
335015091d37SBarry Smith     v        = aa +  4*diag[i];
3351f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3352f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
335315091d37SBarry Smith   }
335415091d37SBarry Smith 
3355d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3357dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
335815091d37SBarry Smith   PetscFunctionReturn(0);
335915091d37SBarry Smith }
336015091d37SBarry Smith 
33614a2ae208SSatish Balay #undef __FUNCT__
3362cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3363cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3364cee9d6f2SShri Abhyankar {
3365cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3366cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3367cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3368cee9d6f2SShri Abhyankar     PetscInt          jdx;
3369cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3370cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
3371cee9d6f2SShri Abhyankar     const PetscScalar *b;
3372cee9d6f2SShri Abhyankar 
3373cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3374cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3375cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3376cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3377cee9d6f2SShri Abhyankar     idx    = 0;
3378cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
3379cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3380cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
3381cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3382cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3383cee9d6f2SShri Abhyankar        idx  = 2*i;
3384cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
3385cee9d6f2SShri Abhyankar        while (nz--) {
3386cee9d6f2SShri Abhyankar           jdx   = 2*(*vi++);
3387cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
3388cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
3389cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
3390cee9d6f2SShri Abhyankar            v   +=  4;
3391cee9d6f2SShri Abhyankar         }
3392cee9d6f2SShri Abhyankar        x[idx]   = s1;
3393cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3394cee9d6f2SShri Abhyankar     }
3395cee9d6f2SShri Abhyankar 
3396cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3397cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3398cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
3399cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3400cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3401cee9d6f2SShri Abhyankar      idt = 2*i;
3402cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
3403cee9d6f2SShri Abhyankar      while (nz--) {
3404cee9d6f2SShri Abhyankar       idx   = 2*(*vi++);
3405cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
3406cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
3407cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
3408cee9d6f2SShri Abhyankar          v    += 4;
3409cee9d6f2SShri Abhyankar     }
3410cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3411cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
3412cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
3413cee9d6f2SShri Abhyankar   }
3414cee9d6f2SShri Abhyankar 
3415cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3416cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3417cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3418cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3419cee9d6f2SShri Abhyankar }
3420cee9d6f2SShri Abhyankar 
3421cee9d6f2SShri Abhyankar #undef __FUNCT__
34224a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3423dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
34244e2b4712SSatish Balay {
34254e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
34264e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
34276849ba73SBarry Smith   PetscErrorCode ierr;
34285d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
34295d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
34303f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
343187828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
34324e2b4712SSatish Balay 
34334e2b4712SSatish Balay   PetscFunctionBegin;
34344e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
34354e2b4712SSatish Balay 
34361ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3438f1af5d2fSBarry Smith   t  = a->solve_work;
34394e2b4712SSatish Balay 
34404e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
34414e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
34424e2b4712SSatish Balay 
34434e2b4712SSatish Balay   /* forward solve the lower triangular */
3444f1af5d2fSBarry Smith   t[0] = b[*r++];
34454e2b4712SSatish Balay   for (i=1; i<n; i++) {
34464e2b4712SSatish Balay     v     = aa + ai[i];
34474e2b4712SSatish Balay     vi    = aj + ai[i];
34484e2b4712SSatish Balay     nz    = diag[i] - ai[i];
3449f1af5d2fSBarry Smith     s1  = b[*r++];
34504e2b4712SSatish Balay     while (nz--) {
3451f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
34524e2b4712SSatish Balay     }
3453f1af5d2fSBarry Smith     t[i] = s1;
34544e2b4712SSatish Balay   }
34554e2b4712SSatish Balay   /* backward solve the upper triangular */
34564e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
34574e2b4712SSatish Balay     v    = aa + diag[i] + 1;
34584e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
34594e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3460f1af5d2fSBarry Smith     s1 = t[i];
34614e2b4712SSatish Balay     while (nz--) {
3462f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
34634e2b4712SSatish Balay     }
3464f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
34654e2b4712SSatish Balay   }
34664e2b4712SSatish Balay 
34674e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
34684e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34691ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3471dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
34724e2b4712SSatish Balay   PetscFunctionReturn(0);
34734e2b4712SSatish Balay }
347415091d37SBarry Smith /*
347515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
347615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
347715091d37SBarry Smith */
34784a2ae208SSatish Balay #undef __FUNCT__
34794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3480dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
348115091d37SBarry Smith {
348215091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3483690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3484dfbe8321SBarry Smith   PetscErrorCode ierr;
3485690b6cddSBarry Smith   PetscInt       *diag = a->diag;
348615091d37SBarry Smith   MatScalar      *aa=a->a;
348787828ca2SBarry Smith   PetscScalar    *x,*b;
348887828ca2SBarry Smith   PetscScalar    s1,x1;
348915091d37SBarry Smith   MatScalar      *v;
3490690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
349115091d37SBarry Smith 
349215091d37SBarry Smith   PetscFunctionBegin;
34931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
349515091d37SBarry Smith 
349615091d37SBarry Smith   /* forward solve the lower triangular */
349715091d37SBarry Smith   idx    = 0;
349815091d37SBarry Smith   x[0]   = b[0];
349915091d37SBarry Smith   for (i=1; i<n; i++) {
350015091d37SBarry Smith     v     =  aa      + ai[i];
350115091d37SBarry Smith     vi    =  aj      + ai[i];
350215091d37SBarry Smith     nz    =  diag[i] - ai[i];
350315091d37SBarry Smith     idx   +=  1;
3504f1af5d2fSBarry Smith     s1  =  b[idx];
350515091d37SBarry Smith     while (nz--) {
350615091d37SBarry Smith       jdx   = *vi++;
350715091d37SBarry Smith       x1    = x[jdx];
3508f1af5d2fSBarry Smith       s1 -= v[0]*x1;
350915091d37SBarry Smith       v    += 1;
351015091d37SBarry Smith     }
3511f1af5d2fSBarry Smith     x[idx]   = s1;
351215091d37SBarry Smith   }
351315091d37SBarry Smith   /* backward solve the upper triangular */
351415091d37SBarry Smith   for (i=n-1; i>=0; i--){
351515091d37SBarry Smith     v    = aa + diag[i] + 1;
351615091d37SBarry Smith     vi   = aj + diag[i] + 1;
351715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
351815091d37SBarry Smith     idt  = i;
3519f1af5d2fSBarry Smith     s1 = x[idt];
352015091d37SBarry Smith     while (nz--) {
352115091d37SBarry Smith       idx   = *vi++;
352215091d37SBarry Smith       x1    = x[idx];
3523f1af5d2fSBarry Smith       s1 -= v[0]*x1;
352415091d37SBarry Smith       v    += 1;
352515091d37SBarry Smith     }
352615091d37SBarry Smith     v        = aa +  diag[i];
3527f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
352815091d37SBarry Smith   }
35291ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
35301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3531dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
353215091d37SBarry Smith   PetscFunctionReturn(0);
353315091d37SBarry Smith }
35344e2b4712SSatish Balay 
35354e2b4712SSatish Balay /* ----------------------------------------------------------------*/
353616a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
35376bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
35386bce7ff8SHong Zhang 
353984a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
35406bce7ff8SHong Zhang #undef __FUNCT__
35416bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
35426bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
35436bce7ff8SHong Zhang {
35446bce7ff8SHong Zhang   Mat            C=B;
35456bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
35466bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
35476bce7ff8SHong Zhang   PetscErrorCode ierr;
35486bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
35496bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
35506bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3551914a18a2SHong Zhang   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3552914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3553914a18a2SHong Zhang   MatScalar      *v_work;
35546bce7ff8SHong Zhang 
35556bce7ff8SHong Zhang   PetscFunctionBegin;
35566bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35576bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3558914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3559914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
35606bce7ff8SHong Zhang   ics  = ic;
35616bce7ff8SHong Zhang 
3562914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
3563914a18a2SHong Zhang   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3564914a18a2SHong Zhang   multiplier = v_work + bs;
3565914a18a2SHong Zhang   v_pivots   = (PetscInt*)(multiplier + bs2);
3566914a18a2SHong Zhang 
35676bce7ff8SHong Zhang   for (i=0; i<n; i++){
35686bce7ff8SHong Zhang     /* zero rtmp */
35696bce7ff8SHong Zhang     /* L part */
35706bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
35716bce7ff8SHong Zhang     bjtmp = bj + bi[i];
3572914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3573914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3574914a18a2SHong Zhang     }
35756bce7ff8SHong Zhang 
35766bce7ff8SHong Zhang     /* U part */
35776bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
35786bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
3579914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3580914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3581914a18a2SHong Zhang     }
35826bce7ff8SHong Zhang 
35836bce7ff8SHong Zhang     /* load in initial (unfactored row) */
35846bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
35856bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
3586914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
35876bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3588914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
35896bce7ff8SHong Zhang     }
35906bce7ff8SHong Zhang 
35916bce7ff8SHong Zhang     /* elimination */
35926bce7ff8SHong Zhang     bjtmp = bj + bi[i];
35936bce7ff8SHong Zhang     row   = *bjtmp++;
35946bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
35956bce7ff8SHong Zhang     k   = 0;
35966bce7ff8SHong Zhang     while  (k < nzL) {
3597914a18a2SHong Zhang       pc = rtmp + bs2*row;
3598914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3599914a18a2SHong Zhang       if (flg) {
3600914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
3601914a18a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
36026bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3603914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
36046bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3605914a18a2SHong Zhang         for (j=0; j<nz; j++) {
3606914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3607914a18a2SHong Zhang         }
36086bce7ff8SHong Zhang         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
36096bce7ff8SHong Zhang       }
36106bce7ff8SHong Zhang       row = *bjtmp++; k++;
36116bce7ff8SHong Zhang     }
36126bce7ff8SHong Zhang 
36136bce7ff8SHong Zhang     /* finished row so stick it into b->a */
36146bce7ff8SHong Zhang     /* L part */
3615914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
36166bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
36176bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
36186bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3619914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
36206bce7ff8SHong Zhang     }
36216bce7ff8SHong Zhang 
36226bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
3623914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
36246bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
3625914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3626914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3627914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
36286bce7ff8SHong Zhang 
36296bce7ff8SHong Zhang     /* U part */
3630914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
36316bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
36326bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3633914a18a2SHong Zhang     for (j=0; j<nz; j++){
3634914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3635914a18a2SHong Zhang     }
36366bce7ff8SHong Zhang   }
36376bce7ff8SHong Zhang 
36386bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
36396bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
36406bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
36416bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
364227019359SHong Zhang 
36436bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
3644914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
36456bce7ff8SHong Zhang   PetscFunctionReturn(0);
36466bce7ff8SHong Zhang }
36476bce7ff8SHong Zhang 
36486bce7ff8SHong Zhang /*
36496bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
365016a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
365116a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
36526bce7ff8SHong Zhang */
36536bce7ff8SHong Zhang #undef __FUNCT__
36546bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
36556bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
36566bce7ff8SHong Zhang {
36576bce7ff8SHong Zhang 
36586bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
36596bce7ff8SHong Zhang   PetscErrorCode     ierr;
366016a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
366116a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
36626bce7ff8SHong Zhang 
36636bce7ff8SHong Zhang   PetscFunctionBegin;
366416a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
366516a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
36666bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
366716a2bf60SHong Zhang 
366816a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
366916a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
367016a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
367116a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
367216a2bf60SHong Zhang   if (!b->diag){
367316a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
367416a2bf60SHong Zhang   }
3675914a18a2SHong Zhang   bdiag = b->diag;
36766bce7ff8SHong Zhang 
367716a2bf60SHong Zhang   if (n > 0) {
367816a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
36796bce7ff8SHong Zhang   }
36806bce7ff8SHong Zhang 
36816bce7ff8SHong Zhang   /* set bi and bj with new data structure */
36826bce7ff8SHong Zhang   bi = b->i;
36836bce7ff8SHong Zhang   bj = b->j;
36846bce7ff8SHong Zhang 
36856bce7ff8SHong Zhang   /* L part */
36866bce7ff8SHong Zhang   bi[0] = 0;
368716a2bf60SHong Zhang   for (i=0; i<n; i++){
36886bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
3689914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
36906bce7ff8SHong Zhang     aj = a->j + ai[i];
36916bce7ff8SHong Zhang     for (j=0; j<nz; j++){
36926bce7ff8SHong Zhang       *bj = aj[j]; bj++;
36936bce7ff8SHong Zhang     }
36946bce7ff8SHong Zhang   }
36956bce7ff8SHong Zhang 
36966bce7ff8SHong Zhang   /* U part */
369716a2bf60SHong Zhang   bi[n+1] = bi[n];
369816a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
36996bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
370016a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
37016bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
37026bce7ff8SHong Zhang     for (j=0; j<nz; j++){
37036bce7ff8SHong Zhang       *bj = aj[j]; bj++;
37046bce7ff8SHong Zhang     }
37056bce7ff8SHong Zhang     /* diag[i] */
37066bce7ff8SHong Zhang     *bj = i; bj++;
370716a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
37086bce7ff8SHong Zhang   }
37096bce7ff8SHong Zhang   PetscFunctionReturn(0);
37106bce7ff8SHong Zhang }
37116bce7ff8SHong Zhang 
371216a2bf60SHong Zhang extern PetscErrorCode PetscFreeSpaceContiguous_newdatastruct(PetscFreeSpaceList *,PetscInt *,PetscInt,PetscInt *,PetscInt *);
371316a2bf60SHong Zhang #undef __FUNCT__
371416a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
371516a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
371616a2bf60SHong Zhang {
371716a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
371816a2bf60SHong Zhang   IS                 isicol;
371916a2bf60SHong Zhang   PetscErrorCode     ierr;
372016a2bf60SHong Zhang   const PetscInt     *r,*ic;
3721*7fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
372216a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
372316a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
372416a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
3725*7fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
372616a2bf60SHong Zhang   PetscReal          f;
372716a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
372816a2bf60SHong Zhang   PetscBT            lnkbt;
372916a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
373016a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
373116a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
373216a2bf60SHong Zhang   PetscTruth         missing;
3733*7fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
373416a2bf60SHong Zhang 
373516a2bf60SHong Zhang   PetscFunctionBegin;
373616a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
373716a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
373816a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
373916a2bf60SHong Zhang 
374016a2bf60SHong Zhang   f             = info->fill;
374116a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
374216a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
374316a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
374416a2bf60SHong Zhang 
374516a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
374616a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
3747*7fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
374816a2bf60SHong Zhang 
3749*7fa3a6a0SHong Zhang   if (!levels && both_identity) {
375016a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
375116a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
375216a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3753*7fa3a6a0SHong Zhang     /* set MatSolve routines */
3754*7fa3a6a0SHong Zhang     switch (bs){
3755*7fa3a6a0SHong Zhang     case 2:
3756*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3757*7fa3a6a0SHong Zhang       break;
3758*7fa3a6a0SHong Zhang     case 3:
3759*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3760*7fa3a6a0SHong Zhang       break;
3761*7fa3a6a0SHong Zhang     case 4:
3762*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3763*7fa3a6a0SHong Zhang       break;
3764*7fa3a6a0SHong Zhang     case 5:
3765*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3766*7fa3a6a0SHong Zhang       break;
3767*7fa3a6a0SHong Zhang     case 6:
3768*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3769*7fa3a6a0SHong Zhang       break;
3770*7fa3a6a0SHong Zhang     case 7:
3771*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3772*7fa3a6a0SHong Zhang       break;
3773*7fa3a6a0SHong Zhang     default:
3774*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3775*7fa3a6a0SHong Zhang       break;
3776*7fa3a6a0SHong Zhang     }
377716a2bf60SHong Zhang 
377816a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
377916a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
378016a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
378116a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
378216a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
378316a2bf60SHong Zhang     b->row           = isrow;
378416a2bf60SHong Zhang     b->col           = iscol;
378516a2bf60SHong Zhang     b->icol          = isicol;
378616a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
378716a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
378816a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
378916a2bf60SHong Zhang     ierr = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
379016a2bf60SHong Zhang     PetscFunctionReturn(0);
379116a2bf60SHong Zhang   }
379216a2bf60SHong Zhang 
379316a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
379416a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
379516a2bf60SHong Zhang 
379616a2bf60SHong Zhang   /* get new row pointers */
379716a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
379816a2bf60SHong Zhang   bi[0] = 0;
379916a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
380016a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
380116a2bf60SHong Zhang   bdiag[0]  = 0;
380216a2bf60SHong Zhang 
380316a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
380416a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
380516a2bf60SHong Zhang 
380616a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
380716a2bf60SHong Zhang   nlnk = n + 1;
380816a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
380916a2bf60SHong Zhang 
381016a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
381116a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
381216a2bf60SHong Zhang   current_space = free_space;
381316a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
381416a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
381516a2bf60SHong Zhang 
381616a2bf60SHong Zhang   for (i=0; i<n; i++) {
381716a2bf60SHong Zhang     nzi = 0;
381816a2bf60SHong Zhang     /* copy current row into linked list */
381916a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
382016a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
382116a2bf60SHong Zhang     cols = aj + ai[r[i]];
382216a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
382316a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
382416a2bf60SHong Zhang     nzi += nlnk;
382516a2bf60SHong Zhang 
382616a2bf60SHong Zhang     /* make sure diagonal entry is included */
382716a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
382816a2bf60SHong Zhang       fm = n;
382916a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
383016a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
383116a2bf60SHong Zhang       lnk[fm]    = i;
383216a2bf60SHong Zhang       lnk_lvl[i] = 0;
383316a2bf60SHong Zhang       nzi++; dcount++;
383416a2bf60SHong Zhang     }
383516a2bf60SHong Zhang 
383616a2bf60SHong Zhang     /* add pivot rows into the active row */
383716a2bf60SHong Zhang     nzbd = 0;
383816a2bf60SHong Zhang     prow = lnk[n];
383916a2bf60SHong Zhang     while (prow < i) {
384016a2bf60SHong Zhang       nnz      = bdiag[prow];
384116a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
384216a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
384316a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
384416a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
384516a2bf60SHong Zhang       nzi += nlnk;
384616a2bf60SHong Zhang       prow = lnk[prow];
384716a2bf60SHong Zhang       nzbd++;
384816a2bf60SHong Zhang     }
384916a2bf60SHong Zhang     bdiag[i] = nzbd;
385016a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
385116a2bf60SHong Zhang 
385216a2bf60SHong Zhang     /* if free space is not available, make more free space */
385316a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
385416a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
385516a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
385616a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
385716a2bf60SHong Zhang       reallocs++;
385816a2bf60SHong Zhang     }
385916a2bf60SHong Zhang 
386016a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
386116a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
386216a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
386316a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
386416a2bf60SHong Zhang 
386516a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
386616a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
386716a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
386816a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
386916a2bf60SHong Zhang     }
387016a2bf60SHong Zhang 
387116a2bf60SHong Zhang     current_space->array           += nzi;
387216a2bf60SHong Zhang     current_space->local_used      += nzi;
387316a2bf60SHong Zhang     current_space->local_remaining -= nzi;
387416a2bf60SHong Zhang     current_space_lvl->array           += nzi;
387516a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
387616a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
387716a2bf60SHong Zhang   }
387816a2bf60SHong Zhang 
387916a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
388016a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
388116a2bf60SHong Zhang 
388216a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
388316a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
388416a2bf60SHong Zhang 
388516a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
388616a2bf60SHong Zhang   ierr = PetscFreeSpaceContiguous_newdatastruct(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
388716a2bf60SHong Zhang 
388816a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
388916a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
389016a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
389116a2bf60SHong Zhang 
389216a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
389316a2bf60SHong Zhang   {
389416a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
389516a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
389616a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
389716a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
389816a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
389916a2bf60SHong Zhang     if (diagonal_fill) {
390016a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
390116a2bf60SHong Zhang     }
390216a2bf60SHong Zhang   }
390316a2bf60SHong Zhang #endif
390416a2bf60SHong Zhang 
390516a2bf60SHong Zhang   /* put together the new matrix */
390616a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
390716a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
390816a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
390916a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
391016a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
391116a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
3912*7fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
391316a2bf60SHong Zhang   b->j          = bj;
391416a2bf60SHong Zhang   b->i          = bi;
391516a2bf60SHong Zhang   b->diag       = bdiag;
391616a2bf60SHong Zhang   b->ilen       = 0;
391716a2bf60SHong Zhang   b->imax       = 0;
391816a2bf60SHong Zhang   b->row        = isrow;
391916a2bf60SHong Zhang   b->col        = iscol;
392016a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
392116a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
392216a2bf60SHong Zhang   b->icol       = isicol;
3923*7fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
392416a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
392516a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
3926*7fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
392716a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
392816a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
392916a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
393016a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
393116a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
3932*7fa3a6a0SHong Zhang   /* set MatSolve routines */
3933*7fa3a6a0SHong Zhang   if (both_identity){
3934*7fa3a6a0SHong Zhang     switch (bs){
3935*7fa3a6a0SHong Zhang     case 2:
3936*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3937*7fa3a6a0SHong Zhang       break;
3938*7fa3a6a0SHong Zhang     case 3:
3939*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3940*7fa3a6a0SHong Zhang       break;
3941*7fa3a6a0SHong Zhang     case 4:
3942*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3943*7fa3a6a0SHong Zhang       break;
3944*7fa3a6a0SHong Zhang     case 5:
3945*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3946*7fa3a6a0SHong Zhang       break;
3947*7fa3a6a0SHong Zhang     case 6:
3948*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3949*7fa3a6a0SHong Zhang       break;
3950*7fa3a6a0SHong Zhang     case 7:
3951*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3952*7fa3a6a0SHong Zhang       break;
3953*7fa3a6a0SHong Zhang     default:
3954*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3955*7fa3a6a0SHong Zhang       break;
3956*7fa3a6a0SHong Zhang     }
3957*7fa3a6a0SHong Zhang   } else {
3958*7fa3a6a0SHong Zhang     switch (bs){
3959*7fa3a6a0SHong Zhang       /* not implemented yet!
3960*7fa3a6a0SHong Zhang     case 2:
3961*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
3962*7fa3a6a0SHong Zhang       break;
3963*7fa3a6a0SHong Zhang     case 3:
3964*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
3965*7fa3a6a0SHong Zhang       break;
3966*7fa3a6a0SHong Zhang     case 4:
3967*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
3968*7fa3a6a0SHong Zhang       break;
3969*7fa3a6a0SHong Zhang     case 5:
3970*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
3971*7fa3a6a0SHong Zhang       break;
3972*7fa3a6a0SHong Zhang     case 6:
3973*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
3974*7fa3a6a0SHong Zhang       break;
3975*7fa3a6a0SHong Zhang     case 7:
3976*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
3977*7fa3a6a0SHong Zhang       break;
3978*7fa3a6a0SHong Zhang     default:
3979*7fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
3980*7fa3a6a0SHong Zhang       break;
3981*7fa3a6a0SHong Zhang       */
3982*7fa3a6a0SHong Zhang     }
3983*7fa3a6a0SHong Zhang   }
398416a2bf60SHong Zhang   PetscFunctionReturn(0);
398516a2bf60SHong Zhang }
398616a2bf60SHong Zhang 
39874e2b4712SSatish Balay /*
39884e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
39894e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
39904e2b4712SSatish Balay    Not a good example of code reuse.
39914e2b4712SSatish Balay */
39924a2ae208SSatish Balay #undef __FUNCT__
39934a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
39940481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
39954e2b4712SSatish Balay {
39964e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
39974e2b4712SSatish Balay   IS             isicol;
39986849ba73SBarry Smith   PetscErrorCode ierr;
39995d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
40005d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4001a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4002d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
400341df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
4004329f5518SBarry Smith   PetscReal      f;
400516a2bf60SHong Zhang   PetscTruth     newdatastruct=PETSC_FALSE;
40064e2b4712SSatish Balay 
40074e2b4712SSatish Balay   PetscFunctionBegin;
400816a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
400916a2bf60SHong Zhang   if (newdatastruct){
401016a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
401116a2bf60SHong Zhang     PetscFunctionReturn(0);
401216a2bf60SHong Zhang   }
401316a2bf60SHong Zhang 
40146bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
40156bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
40166bce7ff8SHong Zhang 
4017435faa5fSBarry Smith   f             = info->fill;
4018690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
4019690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
40204c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
402116a2bf60SHong Zhang 
4022667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4023667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
40247d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
4025309c388cSBarry Smith 
402641df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
402716a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
40286bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
40296bce7ff8SHong Zhang 
4030719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
4031719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
4032bb3d539aSBarry Smith     b->row       = isrow;
4033bb3d539aSBarry Smith     b->col       = iscol;
4034bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4035bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4036bb3d539aSBarry Smith     b->icol      = isicol;
4037bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4038719d5645SBarry Smith     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
40396bce7ff8SHong Zhang     PetscFunctionReturn(0);
40406bce7ff8SHong Zhang   }
40416bce7ff8SHong Zhang 
40426bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
40434e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
40444e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
40454e2b4712SSatish Balay 
40464e2b4712SSatish Balay     /* get new row pointers */
4047690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
40484e2b4712SSatish Balay     ainew[0] = 0;
40494e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
4050690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
4051690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
40524e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
4053690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
40544e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
4055690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
40564e2b4712SSatish Balay     /* im is level for each filled value */
4057690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
40584e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
4059690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
40604e2b4712SSatish Balay     dloc[0]  = 0;
40614e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
4062435faa5fSBarry Smith 
4063435faa5fSBarry Smith       /* copy prow into linked list */
40644e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
40653b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
40664e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
40674e2b4712SSatish Balay       fill[n]    = n;
4068435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
40694e2b4712SSatish Balay       while (nz--) {
40704e2b4712SSatish Balay 	fm  = n;
40714e2b4712SSatish Balay 	idx = ic[*xi++];
40724e2b4712SSatish Balay 	do {
40734e2b4712SSatish Balay 	  m  = fm;
40744e2b4712SSatish Balay 	  fm = fill[m];
40754e2b4712SSatish Balay 	} while (fm < idx);
40764e2b4712SSatish Balay 	fill[m]   = idx;
40774e2b4712SSatish Balay 	fill[idx] = fm;
40784e2b4712SSatish Balay 	im[idx]   = 0;
40794e2b4712SSatish Balay       }
4080435faa5fSBarry Smith 
4081435faa5fSBarry Smith       /* make sure diagonal entry is included */
4082435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
4083435faa5fSBarry Smith 	fm = n;
4084435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
4085435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4086435faa5fSBarry Smith 	fill[fm]   = prow;
4087435faa5fSBarry Smith 	im[prow]   = 0;
4088435faa5fSBarry Smith 	nzf++;
4089335d9088SBarry Smith 	dcount++;
4090435faa5fSBarry Smith       }
4091435faa5fSBarry Smith 
40924e2b4712SSatish Balay       nzi = 0;
40934e2b4712SSatish Balay       row = fill[n];
40944e2b4712SSatish Balay       while (row < prow) {
40954e2b4712SSatish Balay 	incrlev = im[row] + 1;
40964e2b4712SSatish Balay 	nz      = dloc[row];
4097435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
40984e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
40994e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
41004e2b4712SSatish Balay 	fm      = row;
41014e2b4712SSatish Balay 	while (nnz-- > 0) {
41024e2b4712SSatish Balay 	  idx = *xi++;
41034e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
41044e2b4712SSatish Balay 	    flev++;
41054e2b4712SSatish Balay 	    continue;
41064e2b4712SSatish Balay 	  }
41074e2b4712SSatish Balay 	  do {
41084e2b4712SSatish Balay 	    m  = fm;
41094e2b4712SSatish Balay 	    fm = fill[m];
41104e2b4712SSatish Balay 	  } while (fm < idx);
41114e2b4712SSatish Balay 	  if (fm != idx) {
41124e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
41134e2b4712SSatish Balay 	    fill[m]   = idx;
41144e2b4712SSatish Balay 	    fill[idx] = fm;
41154e2b4712SSatish Balay 	    fm        = idx;
41164e2b4712SSatish Balay 	    nzf++;
4117ecf371e4SBarry Smith 	  } else {
41184e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
41194e2b4712SSatish Balay 	  }
41204e2b4712SSatish Balay 	  flev++;
41214e2b4712SSatish Balay 	}
41224e2b4712SSatish Balay 	row = fill[row];
41234e2b4712SSatish Balay 	nzi++;
41244e2b4712SSatish Balay       }
41254e2b4712SSatish Balay       /* copy new filled row into permanent storage */
41264e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
41274e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
4128ecf371e4SBarry Smith 
4129ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
4130ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4131ecf371e4SBarry Smith 	/* just double the memory each time */
4132690b6cddSBarry Smith 	PetscInt maxadd = jmax;
4133ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
41344e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
41354e2b4712SSatish Balay 	jmax += maxadd;
4136ecf371e4SBarry Smith 
4137ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
41385d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
41395d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4140606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
41415d0c19d7SBarry Smith 	ajnew = xitmp;
41425d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
41435d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4144606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
41455d0c19d7SBarry Smith 	ajfill = xitmp;
4146eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
41474e2b4712SSatish Balay       }
41485d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
41494e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
41504e2b4712SSatish Balay       dloc[prow]  = nzi;
41514e2b4712SSatish Balay       fm          = fill[n];
41524e2b4712SSatish Balay       while (nzf--) {
41535d0c19d7SBarry Smith 	*xitmp++ = fm;
41544e2b4712SSatish Balay 	*flev++ = im[fm];
41554e2b4712SSatish Balay 	fm      = fill[fm];
41564e2b4712SSatish Balay       }
4157435faa5fSBarry Smith       /* make sure row has diagonal entry */
4158435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
415977431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
41602401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4161435faa5fSBarry Smith       }
41624e2b4712SSatish Balay     }
4163606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
41644e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
41654e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4166606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
4167606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
41684e2b4712SSatish Balay 
41696cf91177SBarry Smith #if defined(PETSC_USE_INFO)
41704e2b4712SSatish Balay     {
4171329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4172ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4173ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4174ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4175ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4176335d9088SBarry Smith       if (diagonal_fill) {
4177ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4178335d9088SBarry Smith       }
41794e2b4712SSatish Balay     }
418063ba0a88SBarry Smith #endif
41814e2b4712SSatish Balay 
41824e2b4712SSatish Balay     /* put together the new matrix */
4183719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4184719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4185719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
4186e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
4187e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
41887c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
4189a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
41904e2b4712SSatish Balay     b->j          = ajnew;
41914e2b4712SSatish Balay     b->i          = ainew;
41924e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
41934e2b4712SSatish Balay     b->diag       = dloc;
41944e2b4712SSatish Balay     b->ilen       = 0;
41954e2b4712SSatish Balay     b->imax       = 0;
41964e2b4712SSatish Balay     b->row        = isrow;
41974e2b4712SSatish Balay     b->col        = iscol;
4198bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4199c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4200c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4201e51c0b9cSSatish Balay     b->icol       = isicol;
420287828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
42034e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
42044e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
4205719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
42064e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
42074e2b4712SSatish Balay 
4208719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
4209719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
4210719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
42116bce7ff8SHong Zhang 
421241df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
42138661488fSKris Buschelman   PetscFunctionReturn(0);
42148661488fSKris Buschelman }
42158661488fSKris Buschelman 
4216732ee342SKris Buschelman #undef __FUNCT__
42177e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
4218dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
42197e7071cdSKris Buschelman {
422012272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
422112272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
42225a9542e3SKris Buschelman   PetscFunctionBegin;
42237cf1b8d3SKris Buschelman   /* Undo Column scaling */
42247cf1b8d3SKris Buschelman /*    while (nz--) { */
42257cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
42267cf1b8d3SKris Buschelman /*    } */
4227c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
4228c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
42297cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
42307cf1b8d3SKris Buschelman }
42317cf1b8d3SKris Buschelman 
42327cf1b8d3SKris Buschelman #undef __FUNCT__
42337cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4234dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
42357cf1b8d3SKris Buschelman {
42367cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4237b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
42382aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
42395a9542e3SKris Buschelman   PetscFunctionBegin;
42400b9da03eSKris Buschelman   /* Is this really necessary? */
424120235379SKris Buschelman   while (nz--) {
42420b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
42437e7071cdSKris Buschelman   }
4244c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
42457e7071cdSKris Buschelman   PetscFunctionReturn(0);
42467e7071cdSKris Buschelman }
42477e7071cdSKris Buschelman 
4248732ee342SKris Buschelman 
4249