xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6506fda5ae74552162d38af228d71d0c0dfd82b5)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
12818f690400SShri Abhyankar {
12828f690400SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12838f690400SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
12848f690400SShri Abhyankar   PetscErrorCode ierr;
12858f690400SShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
128629b92fc1SShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
12878f690400SShri Abhyankar   MatScalar      *aa=a->a,*v;
12888f690400SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
12898f690400SShri Abhyankar   PetscScalar    *x,*b,*t;
12908f690400SShri Abhyankar 
12918f690400SShri Abhyankar   PetscFunctionBegin;
12928f690400SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12938f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12948f690400SShri Abhyankar   t  = a->solve_work;
12958f690400SShri Abhyankar 
12968f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
129729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12988f690400SShri Abhyankar 
12998f690400SShri Abhyankar   /* forward solve the lower triangular */
130029b92fc1SShri Abhyankar   idx    = 7*r[0];
13018f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
13028f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
13038f690400SShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
13048f690400SShri Abhyankar 
13058f690400SShri Abhyankar   for (i=1; i<n; i++) {
13068f690400SShri Abhyankar     v     = aa + 49*ai[i];
13078f690400SShri Abhyankar     vi    = aj + ai[i];
13088f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
130929b92fc1SShri Abhyankar     idx   = 7*r[i];
13108f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
13118f690400SShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
131229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
131329b92fc1SShri Abhyankar       idx   = 7*vi[m];
13148f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
13158f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
13168f690400SShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
13178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13228f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13238f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13248f690400SShri Abhyankar       v += 49;
13258f690400SShri Abhyankar     }
13268f690400SShri Abhyankar     idx = 7*i;
13278f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
13288f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
13298f690400SShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
13308f690400SShri Abhyankar   }
13318f690400SShri Abhyankar   /* backward solve the upper triangular */
13328f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
13338f690400SShri Abhyankar     k    = 2*n-i;
13348f690400SShri Abhyankar     v    = aa + 49*ai[k];
13358f690400SShri Abhyankar     vi   = aj + ai[k];
13368f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
13378f690400SShri Abhyankar     idt  = 7*i;
13388f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
13398f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
13408f690400SShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
134129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
134229b92fc1SShri Abhyankar       idx   = 7*vi[m];
13438f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
13448f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13458f690400SShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
13468f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13478f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13488f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13498f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13508f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13518f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13528f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13538f690400SShri Abhyankar       v += 49;
13548f690400SShri Abhyankar     }
135529b92fc1SShri Abhyankar     idc = 7*c[i];
13568f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13578f690400SShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13588f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13598f690400SShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
13608f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
13618f690400SShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
13628f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
13638f690400SShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
13648f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
13658f690400SShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
13668f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
13678f690400SShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
13688f690400SShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
13698f690400SShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13708f690400SShri Abhyankar   }
13718f690400SShri Abhyankar 
13728f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13738f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13748f690400SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13758f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13768f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13778f690400SShri Abhyankar   PetscFunctionReturn(0);
13788f690400SShri Abhyankar }
13798f690400SShri Abhyankar 
13808f690400SShri Abhyankar #undef __FUNCT__
13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
138315091d37SBarry Smith {
138415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386dfbe8321SBarry Smith   PetscErrorCode    ierr;
1387690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1388d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1389d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390d9fead3dSBarry Smith   const PetscScalar *b;
139115091d37SBarry Smith 
139215091d37SBarry Smith   PetscFunctionBegin;
1393d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
139515091d37SBarry Smith   /* forward solve the lower triangular */
139615091d37SBarry Smith   idx    = 0;
139715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
139815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
139915091d37SBarry Smith   x[6] = b[6+idx];
140015091d37SBarry Smith   for (i=1; i<n; i++) {
140115091d37SBarry Smith     v     =  aa + 49*ai[i];
140215091d37SBarry Smith     vi    =  aj + ai[i];
140315091d37SBarry Smith     nz    =  diag[i] - ai[i];
140415091d37SBarry Smith     idx   =  7*i;
1405f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407f1af5d2fSBarry Smith     s7  =  b[6+idx];
140815091d37SBarry Smith     while (nz--) {
140915091d37SBarry Smith       jdx   = 7*(*vi++);
141015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
141115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
141215091d37SBarry Smith       x7    = x[6+jdx];
1413f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
142015091d37SBarry Smith       v += 49;
142115091d37SBarry Smith      }
1422f1af5d2fSBarry Smith     x[idx]   = s1;
1423f1af5d2fSBarry Smith     x[1+idx] = s2;
1424f1af5d2fSBarry Smith     x[2+idx] = s3;
1425f1af5d2fSBarry Smith     x[3+idx] = s4;
1426f1af5d2fSBarry Smith     x[4+idx] = s5;
1427f1af5d2fSBarry Smith     x[5+idx] = s6;
1428f1af5d2fSBarry Smith     x[6+idx] = s7;
142915091d37SBarry Smith   }
143015091d37SBarry Smith   /* backward solve the upper triangular */
143115091d37SBarry Smith   for (i=n-1; i>=0; i--){
143215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
143315091d37SBarry Smith     vi   = aj + diag[i] + 1;
143415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
143515091d37SBarry Smith     idt  = 7*i;
1436f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1437f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1438f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1439f1af5d2fSBarry Smith     s7 = x[6+idt];
144015091d37SBarry Smith     while (nz--) {
144115091d37SBarry Smith       idx   = 7*(*vi++);
144215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
144315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
144415091d37SBarry Smith       x7    = x[6+idx];
1445f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
145215091d37SBarry Smith       v += 49;
145315091d37SBarry Smith     }
145415091d37SBarry Smith     v        = aa + 49*diag[i];
1455f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
146915091d37SBarry Smith   }
147015091d37SBarry Smith 
1471d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
147415091d37SBarry Smith   PetscFunctionReturn(0);
147515091d37SBarry Smith }
147615091d37SBarry Smith 
14774a2ae208SSatish Balay #undef __FUNCT__
1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480cee9d6f2SShri Abhyankar {
1481cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
14826464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1484cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1485cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1487cee9d6f2SShri Abhyankar     PetscScalar       *x;
1488cee9d6f2SShri Abhyankar     const PetscScalar *b;
1489cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490cee9d6f2SShri Abhyankar 
1491cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1492cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1495cee9d6f2SShri Abhyankar     idx    = 0;
1496cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1499cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1500cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1501cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1502cee9d6f2SShri Abhyankar       idx   = bs*i;
1503cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
15056464896eSShri Abhyankar        for(k=0;k<nz;k++) {
15066464896eSShri Abhyankar           jdx   = bs*vi[k];
1507cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516cee9d6f2SShri Abhyankar           v   +=  bs2;
1517cee9d6f2SShri Abhyankar         }
1518cee9d6f2SShri Abhyankar 
1519cee9d6f2SShri Abhyankar        x[idx]   = s1;
1520cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1521cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1522cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1523cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1524cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1525cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1526cee9d6f2SShri Abhyankar     }
1527cee9d6f2SShri Abhyankar 
1528cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1529cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1530cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1531cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1532cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533cee9d6f2SShri Abhyankar      idt = bs*i;
1534cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
15366464896eSShri Abhyankar     for(k=0;k<nz;k++) {
15376464896eSShri Abhyankar       idx   = bs*vi[k];
1538cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547cee9d6f2SShri Abhyankar         v   +=  bs2;
1548cee9d6f2SShri Abhyankar     }
1549cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1550cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557cee9d6f2SShri Abhyankar   }
1558cee9d6f2SShri Abhyankar 
1559cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1563cee9d6f2SShri Abhyankar }
1564cee9d6f2SShri Abhyankar 
1565cee9d6f2SShri Abhyankar #undef __FUNCT__
156653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
156753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
156853cca76cSShri Abhyankar {
156953cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
157053cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
157153cca76cSShri Abhyankar     PetscErrorCode    ierr;
157253cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
157353cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
157453cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
157553cca76cSShri Abhyankar     PetscScalar       *x;
157653cca76cSShri Abhyankar     const PetscScalar *b;
157753cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
157853cca76cSShri Abhyankar 
157953cca76cSShri Abhyankar     PetscFunctionBegin;
158053cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
158153cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
158253cca76cSShri Abhyankar     /* forward solve the lower triangular */
158353cca76cSShri Abhyankar     idx    = 0;
158453cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
158553cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
158653cca76cSShri Abhyankar     for (i=1; i<n; i++) {
158753cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
158853cca76cSShri Abhyankar        vi   = aj + ai[i];
158953cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
159053cca76cSShri Abhyankar       idx   = bs*i;
159153cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
159253cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
159353cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
159453cca76cSShri Abhyankar           jdx   = bs*vi[k];
159553cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
159653cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
159753cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
159853cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
159953cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
160053cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
160153cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
160253cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
160353cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
160453cca76cSShri Abhyankar           v   +=  bs2;
160553cca76cSShri Abhyankar         }
160653cca76cSShri Abhyankar 
160753cca76cSShri Abhyankar        x[idx]   = s1;
160853cca76cSShri Abhyankar        x[1+idx] = s2;
160953cca76cSShri Abhyankar        x[2+idx] = s3;
161053cca76cSShri Abhyankar        x[3+idx] = s4;
161153cca76cSShri Abhyankar        x[4+idx] = s5;
161253cca76cSShri Abhyankar        x[5+idx] = s6;
161353cca76cSShri Abhyankar        x[6+idx] = s7;
161453cca76cSShri Abhyankar     }
161553cca76cSShri Abhyankar 
161653cca76cSShri Abhyankar    /* backward solve the upper triangular */
161753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
161853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
161953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
162053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
162153cca76cSShri Abhyankar      idt = bs*i;
162253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
162353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
162453cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
162553cca76cSShri Abhyankar       idx   = bs*vi[k];
162653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
162753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
162853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
162953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
163053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
163153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
163253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
163353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
163453cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
163553cca76cSShri Abhyankar         v   +=  bs2;
163653cca76cSShri Abhyankar     }
163753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
163853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
163953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
164053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
164153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
164253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
164353cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
164453cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
164553cca76cSShri Abhyankar   }
164653cca76cSShri Abhyankar 
164753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
164853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
165053cca76cSShri Abhyankar   PetscFunctionReturn(0);
165153cca76cSShri Abhyankar }
165253cca76cSShri Abhyankar 
165353cca76cSShri Abhyankar #undef __FUNCT__
16544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1655dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
165615091d37SBarry Smith {
165715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
165815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
16596849ba73SBarry Smith   PetscErrorCode    ierr;
16605d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
16615d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1662d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1663d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1664d9fead3dSBarry Smith   const PetscScalar *b;
166515091d37SBarry Smith   PetscFunctionBegin;
1666d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668f1af5d2fSBarry Smith   t  = a->solve_work;
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
167215091d37SBarry Smith 
167315091d37SBarry Smith   /* forward solve the lower triangular */
167415091d37SBarry Smith   idx    = 6*(*r++);
1675f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1676f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1677f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
167815091d37SBarry Smith   for (i=1; i<n; i++) {
167915091d37SBarry Smith     v     = aa + 36*ai[i];
168015091d37SBarry Smith     vi    = aj + ai[i];
168115091d37SBarry Smith     nz    = diag[i] - ai[i];
168215091d37SBarry Smith     idx   = 6*(*r++);
1683f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1684f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
168515091d37SBarry Smith     while (nz--) {
168615091d37SBarry Smith       idx   = 6*(*vi++);
1687f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1688f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1689f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1690f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1691f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1692f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1693f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1694f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
169515091d37SBarry Smith       v += 36;
169615091d37SBarry Smith     }
169715091d37SBarry Smith     idx = 6*i;
1698f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1699f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1700f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
170115091d37SBarry Smith   }
170215091d37SBarry Smith   /* backward solve the upper triangular */
170315091d37SBarry Smith   for (i=n-1; i>=0; i--){
170415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
170515091d37SBarry Smith     vi   = aj + diag[i] + 1;
170615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
170715091d37SBarry Smith     idt  = 6*i;
1708f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1709f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1710f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
171115091d37SBarry Smith     while (nz--) {
171215091d37SBarry Smith       idx   = 6*(*vi++);
1713f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1714f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1715f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1716f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1717f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1718f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1719f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1720f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1721f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
172215091d37SBarry Smith       v += 36;
172315091d37SBarry Smith     }
172415091d37SBarry Smith     idc = 6*(*c--);
172515091d37SBarry Smith     v   = aa + 36*diag[i];
1726f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1727f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1728f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1729f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1730f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1731f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1732f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1733f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1734f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1735f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1736f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1737f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
173815091d37SBarry Smith   }
173915091d37SBarry Smith 
174015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
174115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1742d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1744dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
174515091d37SBarry Smith   PetscFunctionReturn(0);
174615091d37SBarry Smith }
174715091d37SBarry Smith 
17484a2ae208SSatish Balay #undef __FUNCT__
17498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
17508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
17518f690400SShri Abhyankar {
17528f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17538f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
17548f690400SShri Abhyankar   PetscErrorCode    ierr;
17558f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
175629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
17578f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
17588f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
17598f690400SShri Abhyankar   const PetscScalar *b;
17608f690400SShri Abhyankar   PetscFunctionBegin;
17618f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17628f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
17638f690400SShri Abhyankar   t  = a->solve_work;
17648f690400SShri Abhyankar 
17658f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
176629b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
17678f690400SShri Abhyankar 
17688f690400SShri Abhyankar   /* forward solve the lower triangular */
176929b92fc1SShri Abhyankar   idx    = 6*r[0];
17708f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
17718f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
17728f690400SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
17738f690400SShri Abhyankar   for (i=1; i<n; i++) {
17748f690400SShri Abhyankar     v     = aa + 36*ai[i];
17758f690400SShri Abhyankar     vi    = aj + ai[i];
17768f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
177729b92fc1SShri Abhyankar     idx   = 6*r[i];
17788f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17798f690400SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
178029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
178129b92fc1SShri Abhyankar       idx   = 6*vi[m];
17828f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
17838f690400SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
17848f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
17858f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
17868f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
17878f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
17888f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
17898f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
17908f690400SShri Abhyankar       v += 36;
17918f690400SShri Abhyankar     }
17928f690400SShri Abhyankar     idx = 6*i;
17938f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
17948f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
17958f690400SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
17968f690400SShri Abhyankar   }
17978f690400SShri Abhyankar   /* backward solve the upper triangular */
17988f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
17998f690400SShri Abhyankar     k    = 2*n-i;
18008f690400SShri Abhyankar     v    = aa + 36*ai[k];
18018f690400SShri Abhyankar     vi   = aj + ai[k];
18028f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
18038f690400SShri Abhyankar     idt  = 6*i;
18048f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
18058f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
18068f690400SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
180729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
180829b92fc1SShri Abhyankar       idx   = 6*vi[m];
18098f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
18108f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
18118f690400SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
18128f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18138f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18148f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18158f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18168f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18178f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18188f690400SShri Abhyankar       v += 36;
18198f690400SShri Abhyankar     }
182029b92fc1SShri Abhyankar     idc = 6*c[i];
18218f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
18228f690400SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
18238f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
18248f690400SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
18258f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
18268f690400SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
18278f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
18288f690400SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
18298f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
18308f690400SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
18318f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
18328f690400SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
18338f690400SShri Abhyankar   }
18348f690400SShri Abhyankar 
18358f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18368f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18378f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18388f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
18398f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
18408f690400SShri Abhyankar   PetscFunctionReturn(0);
18418f690400SShri Abhyankar }
18428f690400SShri Abhyankar 
1843*6506fda5SShri Abhyankar #undef __FUNCT__
1844*6506fda5SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2"
1845*6506fda5SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1846*6506fda5SShri Abhyankar {
1847*6506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1848*6506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1849*6506fda5SShri Abhyankar   PetscErrorCode    ierr;
1850*6506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
1851*6506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
1852*6506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1853*6506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1854*6506fda5SShri Abhyankar   const PetscScalar *b;
1855*6506fda5SShri Abhyankar   PetscFunctionBegin;
1856*6506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1857*6506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858*6506fda5SShri Abhyankar   t  = a->solve_work;
1859*6506fda5SShri Abhyankar 
1860*6506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861*6506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862*6506fda5SShri Abhyankar 
1863*6506fda5SShri Abhyankar   /* forward solve the lower triangular */
1864*6506fda5SShri Abhyankar   idx    = 6*r[0];
1865*6506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
1866*6506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
1867*6506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
1868*6506fda5SShri Abhyankar   for (i=1; i<n; i++) {
1869*6506fda5SShri Abhyankar     v     = aa + 36*ai[i];
1870*6506fda5SShri Abhyankar     vi    = aj + ai[i];
1871*6506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
1872*6506fda5SShri Abhyankar     idx   = 6*r[i];
1873*6506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1874*6506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
1875*6506fda5SShri Abhyankar     for(m=0;m<nz;m++){
1876*6506fda5SShri Abhyankar       idx   = 6*vi[m];
1877*6506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1878*6506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1879*6506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1880*6506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1881*6506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1882*6506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1883*6506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1884*6506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1885*6506fda5SShri Abhyankar       v += 36;
1886*6506fda5SShri Abhyankar     }
1887*6506fda5SShri Abhyankar     idx = 6*i;
1888*6506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
1889*6506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
1890*6506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
1891*6506fda5SShri Abhyankar   }
1892*6506fda5SShri Abhyankar   /* backward solve the upper triangular */
1893*6506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
1894*6506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
1895*6506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
1896*6506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
1897*6506fda5SShri Abhyankar     idt  = 6*i;
1898*6506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
1899*6506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
1900*6506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
1901*6506fda5SShri Abhyankar     for(m=0;m<nz;m++){
1902*6506fda5SShri Abhyankar       idx   = 6*vi[m];
1903*6506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
1904*6506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
1905*6506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
1906*6506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1907*6506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1908*6506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1909*6506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1910*6506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1911*6506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1912*6506fda5SShri Abhyankar       v += 36;
1913*6506fda5SShri Abhyankar     }
1914*6506fda5SShri Abhyankar     idc = 6*c[i];
1915*6506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1916*6506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
1917*6506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1918*6506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
1919*6506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1920*6506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
1921*6506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1922*6506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
1923*6506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1924*6506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
1925*6506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1926*6506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
1927*6506fda5SShri Abhyankar   }
1928*6506fda5SShri Abhyankar 
1929*6506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1930*6506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1931*6506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1932*6506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1933*6506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1934*6506fda5SShri Abhyankar   PetscFunctionReturn(0);
1935*6506fda5SShri Abhyankar }
19368f690400SShri Abhyankar 
19378f690400SShri Abhyankar #undef __FUNCT__
19384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1939dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
194015091d37SBarry Smith {
194115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1942690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1943dfbe8321SBarry Smith   PetscErrorCode    ierr;
1944690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1945d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1946d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1947d9fead3dSBarry Smith   const PetscScalar *b;
194815091d37SBarry Smith 
194915091d37SBarry Smith   PetscFunctionBegin;
1950d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19511ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195215091d37SBarry Smith   /* forward solve the lower triangular */
195315091d37SBarry Smith   idx    = 0;
195415091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
195515091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
195615091d37SBarry Smith   for (i=1; i<n; i++) {
195715091d37SBarry Smith     v     =  aa + 36*ai[i];
195815091d37SBarry Smith     vi    =  aj + ai[i];
195915091d37SBarry Smith     nz    =  diag[i] - ai[i];
196015091d37SBarry Smith     idx   =  6*i;
1961f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1962f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
196315091d37SBarry Smith     while (nz--) {
196415091d37SBarry Smith       jdx   = 6*(*vi++);
196515091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
196615091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1967f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1968f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1969f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1970f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1971f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1972f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
197315091d37SBarry Smith       v += 36;
197415091d37SBarry Smith      }
1975f1af5d2fSBarry Smith     x[idx]   = s1;
1976f1af5d2fSBarry Smith     x[1+idx] = s2;
1977f1af5d2fSBarry Smith     x[2+idx] = s3;
1978f1af5d2fSBarry Smith     x[3+idx] = s4;
1979f1af5d2fSBarry Smith     x[4+idx] = s5;
1980f1af5d2fSBarry Smith     x[5+idx] = s6;
198115091d37SBarry Smith   }
198215091d37SBarry Smith   /* backward solve the upper triangular */
198315091d37SBarry Smith   for (i=n-1; i>=0; i--){
198415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
198515091d37SBarry Smith     vi   = aj + diag[i] + 1;
198615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
198715091d37SBarry Smith     idt  = 6*i;
1988f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1989f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1990f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
199115091d37SBarry Smith     while (nz--) {
199215091d37SBarry Smith       idx   = 6*(*vi++);
199315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
199415091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1995f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1996f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1997f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1998f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1999f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2000f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
200115091d37SBarry Smith       v += 36;
200215091d37SBarry Smith     }
200315091d37SBarry Smith     v        = aa + 36*diag[i];
2004f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2005f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2006f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2007f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2008f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2009f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
201015091d37SBarry Smith   }
201115091d37SBarry Smith 
2012d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2014dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
201515091d37SBarry Smith   PetscFunctionReturn(0);
201615091d37SBarry Smith }
201715091d37SBarry Smith 
20184a2ae208SSatish Balay #undef __FUNCT__
2019cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - Block forward sweep followed by a
   block backward sweep, unrolled for 6x6 blocks, with no row/column permutation.

   Input:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a) supplies the block rows
     bb - right-hand side; its array is only read here
   Output:
     xx - written by the forward sweep, then updated in place by the backward sweep

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 36 consecutive scalars with entry (r,c) at v[r+6*c].

   NOTE(review): the arithmetic is unrolled for 6x6 blocks while the strides come from
   bs and bs2 -- assumes bs == 6 and bs2 == 36; confirm against the caller.
   NOTE(review): x[0..5] and b[0..5] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
2020cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2021cee9d6f2SShri Abhyankar {
2022cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
20236464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2024cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2025cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2026cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2027cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2028cee9d6f2SShri Abhyankar     PetscScalar       *x;
2029cee9d6f2SShri Abhyankar     const PetscScalar *b;
2030cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2031cee9d6f2SShri Abhyankar 
2032cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2033cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2034cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2035cee9d6f2SShri Abhyankar     /* forward solve the lower triangular: block row 0 is copied from b; each row i>=1 starts from b and subtracts block*x for the blocks listed for that row */
2036cee9d6f2SShri Abhyankar     idx    = 0;
2037cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2038cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
2039cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2040cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2041cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2042cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i]; /* this sweep uses blocks ai[i] .. ai[i+1]-1 for row i */
2043cee9d6f2SShri Abhyankar       idx   = bs*i;
2044cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2045cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
20466464896eSShri Abhyankar        for(k=0;k<nz;k++){
20476464896eSShri Abhyankar           jdx   = bs*vi[k]; /* first scalar of block column vi[k] in x */
2048cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2049cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2050cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2051cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2052cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2053cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2054cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2055cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2056cee9d6f2SShri Abhyankar           v   +=  bs2; /* advance to the next block */
2057cee9d6f2SShri Abhyankar         }
2058cee9d6f2SShri Abhyankar 
2059cee9d6f2SShri Abhyankar        x[idx]   = s1;
2060cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2061cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2062cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2063cee9d6f2SShri Abhyankar        x[4+idx] = s5;
2064cee9d6f2SShri Abhyankar        x[5+idx] = s6;
2065cee9d6f2SShri Abhyankar     }
2066cee9d6f2SShri Abhyankar 
2067cee9d6f2SShri Abhyankar    /* backward solve the upper triangular: row i reads its blocks through ai[] entry 2*n-i */
2068cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2069cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2070cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2071cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1; /* every block of the segment except its last one */
2072cee9d6f2SShri Abhyankar      idt = bs*i;
2073cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2074cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
20756464896eSShri Abhyankar      for(k=0;k<nz;k++){
20766464896eSShri Abhyankar       idx   = bs*vi[k];
2077cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2078cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
2079cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2080cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2081cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2082cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2083cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2084cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2085cee9d6f2SShri Abhyankar         v   +=  bs2;
2086cee9d6f2SShri Abhyankar     }
2087cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x: v now points at the last block of the segment, which is applied as a 6x6 multiply */
2088cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2089cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2090cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2091cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2092cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2093cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2094cee9d6f2SShri Abhyankar   }
2095cee9d6f2SShri Abhyankar 
2096cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2097cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2098cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2099cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2100cee9d6f2SShri Abhyankar }
21018f690400SShri Abhyankar 
2102cee9d6f2SShri Abhyankar #undef __FUNCT__
210353cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2 - Block forward sweep followed by a
   block backward sweep, unrolled for 6x6 blocks, with no row/column permutation. The
   backward sweep locates each row's blocks through a->diag (adiag) rather than a->i.

   Input:
     A  - matrix whose Mat_SeqBAIJ data (a->i, a->j, a->a, a->diag) supplies the block rows
     bb - right-hand side; its array is only read here
   Output:
     xx - written by the forward sweep, then updated in place by the backward sweep

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 36 consecutive scalars with entry (r,c) at v[r+6*c].
   The backward sweep reads adiag[i+1] for i = n-1, i.e. adiag[n] must be readable.

   NOTE(review): the arithmetic is unrolled for 6x6 blocks while the strides come from
   bs and bs2 -- assumes bs == 6 and bs2 == 36; confirm against the caller.
   NOTE(review): x[0..5] and b[0..5] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
210453cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
210553cca76cSShri Abhyankar {
210653cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
210753cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
210853cca76cSShri Abhyankar     PetscErrorCode    ierr;
210953cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
211053cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
211153cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
211253cca76cSShri Abhyankar     PetscScalar       *x;
211353cca76cSShri Abhyankar     const PetscScalar *b;
211453cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
211553cca76cSShri Abhyankar 
211653cca76cSShri Abhyankar     PetscFunctionBegin;
211753cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
211853cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
211953cca76cSShri Abhyankar     /* forward solve the lower triangular: block row 0 is copied from b; each row i>=1 starts from b and subtracts block*x for the blocks listed for that row */
212053cca76cSShri Abhyankar     idx    = 0;
212153cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
212253cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
212353cca76cSShri Abhyankar     for (i=1; i<n; i++) {
212453cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
212553cca76cSShri Abhyankar        vi   = aj + ai[i];
212653cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i]; /* this sweep uses blocks ai[i] .. ai[i+1]-1 for row i */
212753cca76cSShri Abhyankar       idx   = bs*i;
212853cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
212953cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
213053cca76cSShri Abhyankar        for(k=0;k<nz;k++){
213153cca76cSShri Abhyankar           jdx   = bs*vi[k]; /* first scalar of block column vi[k] in x */
213253cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
213353cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
213453cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
213553cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
213653cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
213753cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
213853cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
213953cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
214053cca76cSShri Abhyankar           v   +=  bs2; /* advance to the next block */
214153cca76cSShri Abhyankar         }
214253cca76cSShri Abhyankar 
214353cca76cSShri Abhyankar        x[idx]   = s1;
214453cca76cSShri Abhyankar        x[1+idx] = s2;
214553cca76cSShri Abhyankar        x[2+idx] = s3;
214653cca76cSShri Abhyankar        x[3+idx] = s4;
214753cca76cSShri Abhyankar        x[4+idx] = s5;
214853cca76cSShri Abhyankar        x[5+idx] = s6;
214953cca76cSShri Abhyankar     }
215053cca76cSShri Abhyankar 
215153cca76cSShri Abhyankar    /* backward solve the upper triangular: row i subtracts blocks adiag[i+1]+1 .. adiag[i]-1 */
215253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
215353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
215453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
215553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
215653cca76cSShri Abhyankar      idt = bs*i;
215753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
215853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
215953cca76cSShri Abhyankar      for(k=0;k<nz;k++){
216053cca76cSShri Abhyankar       idx   = bs*vi[k];
216153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
216253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
216353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
216453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
216553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
216653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
216753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
216853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
216953cca76cSShri Abhyankar         v   +=  bs2;
217053cca76cSShri Abhyankar     }
217153cca76cSShri Abhyankar     /* x = inv_diagonal*x: after the loop v points at block adiag[i], which is applied as a 6x6 multiply */
217253cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
217353cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
217453cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
217553cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
217653cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
217753cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
217853cca76cSShri Abhyankar   }
217953cca76cSShri Abhyankar 
218053cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
218153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
218253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
218353cca76cSShri Abhyankar   PetscFunctionReturn(0);
218453cca76cSShri Abhyankar }
218553cca76cSShri Abhyankar 
218653cca76cSShri Abhyankar #undef __FUNCT__
21874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
/*
   MatSolve_SeqBAIJ_5 - Block forward sweep followed by a block backward sweep, unrolled
   for 5x5 blocks, using the row and column index sets stored in the matrix.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies a->i, a->j, a->a, a->diag, the index
          sets a->row / a->col and the work array a->solve_work
     bb - right-hand side; its array is only read here
   Output:
     xx - written during the backward sweep at block position 5*c[i]

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 25 consecutive scalars with entry (r,c) at v[r+5*c].
   Intermediate results for block row i live in the work array t at offset 5*i.

   NOTE(review): t[0..4] and b[5*r[0]..] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
2188dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
21894e2b4712SSatish Balay {
21904e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21914e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
21926849ba73SBarry Smith   PetscErrorCode    ierr;
21935d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
21945d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2195d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2196d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2197d9fead3dSBarry Smith   const PetscScalar *b;
21984e2b4712SSatish Balay 
21994e2b4712SSatish Balay   PetscFunctionBegin;
2200d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2202f1af5d2fSBarry Smith   t  = a->solve_work;
22034e2b4712SSatish Balay 
22044e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout; /* r is consumed front to back by the forward sweep */
22054e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1); /* c is consumed back to front by the backward sweep */
22064e2b4712SSatish Balay 
22074e2b4712SSatish Balay   /* forward solve the lower triangular: row i starts from the b block at 5*r[i] and subtracts blocks ai[i] .. diag[i]-1 applied to t */
22084e2b4712SSatish Balay   idx    = 5*(*r++);
2209f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2210f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
22114e2b4712SSatish Balay   for (i=1; i<n; i++) {
22124e2b4712SSatish Balay     v     = aa + 25*ai[i];
22134e2b4712SSatish Balay     vi    = aj + ai[i];
22144e2b4712SSatish Balay     nz    = diag[i] - ai[i];
22154e2b4712SSatish Balay     idx   = 5*(*r++);
2216f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2217f1af5d2fSBarry Smith     s5  = b[4+idx];
22184e2b4712SSatish Balay     while (nz--) {
22194e2b4712SSatish Balay       idx   = 5*(*vi++);
2220f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2221f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2222f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2223f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2224f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2225f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2226f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22274e2b4712SSatish Balay       v += 25;
22284e2b4712SSatish Balay     }
22294e2b4712SSatish Balay     idx = 5*i;
2230f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2231f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
22324e2b4712SSatish Balay   }
22334e2b4712SSatish Balay   /* backward solve the upper triangular: row i subtracts blocks diag[i]+1 .. ai[i+1]-1 applied to t */
22344e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22354e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
22364e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
22374e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
22384e2b4712SSatish Balay     idt  = 5*i;
2239f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2240f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
22414e2b4712SSatish Balay     while (nz--) {
22424e2b4712SSatish Balay       idx   = 5*(*vi++);
2243f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2244f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2245f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2246f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2247f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2248f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2249f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22504e2b4712SSatish Balay       v += 25;
22514e2b4712SSatish Balay     }
22524e2b4712SSatish Balay     idc = 5*(*c--);
22534e2b4712SSatish Balay     v   = aa + 25*diag[i]; /* block at diag[i] is applied as a 5x5 multiply; presumably it holds the already-inverted diagonal block -- confirm against the factorization */
2254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2255f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2257f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2259f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2261f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2263f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
22644e2b4712SSatish Balay   }
22654e2b4712SSatish Balay 
22664e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22674e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2268d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2270dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
22714e2b4712SSatish Balay   PetscFunctionReturn(0);
22724e2b4712SSatish Balay }
22734e2b4712SSatish Balay 
22744a2ae208SSatish Balay #undef __FUNCT__
22758f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_newdatastruct - Block forward sweep followed by a block backward
   sweep, unrolled for 5x5 blocks, using the row and column index sets stored in the
   matrix. Both sweeps locate their blocks through a->i only: the forward sweep uses
   entry i, the backward sweep uses entry 2*n-i.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies a->i, a->j, a->a, the index sets
          a->row / a->col and the work array a->solve_work
     bb - right-hand side; its array is only read here
   Output:
     xx - written during the backward sweep at block position 5*c[i]

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 25 consecutive scalars with entry (r,c) at v[r+5*c].
   Intermediate results for block row i live in the work array t at offset 5*i.

   NOTE(review): t[0..4] and b[5*r[0]..] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
22768f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
22778f690400SShri Abhyankar {
22788f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22798f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22808f690400SShri Abhyankar   PetscErrorCode    ierr;
22818f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
228229b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
22838f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
22848f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
22858f690400SShri Abhyankar   const PetscScalar *b;
22868f690400SShri Abhyankar 
22878f690400SShri Abhyankar   PetscFunctionBegin;
22888f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22898f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22908f690400SShri Abhyankar   t  = a->solve_work;
22918f690400SShri Abhyankar 
22928f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
229329b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22948f690400SShri Abhyankar 
22958f690400SShri Abhyankar   /* forward solve the lower triangular: row i starts from the b block at 5*r[i] and subtracts blocks ai[i] .. ai[i+1]-1 applied to t */
229629b92fc1SShri Abhyankar   idx    = 5*r[0];
22978f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
22988f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
22998f690400SShri Abhyankar   for (i=1; i<n; i++) {
23008f690400SShri Abhyankar     v     = aa + 25*ai[i];
23018f690400SShri Abhyankar     vi    = aj + ai[i];
23028f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
230329b92fc1SShri Abhyankar     idx   = 5*r[i];
23048f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
23058f690400SShri Abhyankar     s5  = b[4+idx];
230629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
230729b92fc1SShri Abhyankar       idx   = 5*vi[m];
23088f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
23098f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
23108f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
23118f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
23128f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
23138f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
23148f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
23158f690400SShri Abhyankar       v += 25;
23168f690400SShri Abhyankar     }
23178f690400SShri Abhyankar     idx = 5*i;
23188f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
23198f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
23208f690400SShri Abhyankar   }
23218f690400SShri Abhyankar   /* backward solve the upper triangular: row i reads the segment ai[k] .. ai[k+1]-1 with k = 2*n-i */
23228f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
23238f690400SShri Abhyankar     k    = 2*n-i;
23248f690400SShri Abhyankar     v    = aa + 25*ai[k];
23258f690400SShri Abhyankar     vi   = aj + ai[k];
23268f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1; /* every block of the segment except its last one */
23278f690400SShri Abhyankar     idt  = 5*i;
23288f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
23298f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
233029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
233129b92fc1SShri Abhyankar       idx   = 5*vi[m];
23328f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
23338f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
23348f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
23358f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
23368f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
23378f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
23388f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
23398f690400SShri Abhyankar       v += 25;
23408f690400SShri Abhyankar     }
234129b92fc1SShri Abhyankar     idc = 5*c[i]; /* v now points at the last block of the segment, applied below as a 5x5 multiply; presumably the already-inverted diagonal block -- confirm against the factorization */
23428f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
23438f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
23448f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
23458f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
23468f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
23478f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
23488f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
23498f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
23508f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
23518f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
23528f690400SShri Abhyankar   }
23538f690400SShri Abhyankar 
23548f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23558f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23568f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23578f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23588f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
23598f690400SShri Abhyankar   PetscFunctionReturn(0);
23608f690400SShri Abhyankar }
236178bb4007SShri Abhyankar 
236278bb4007SShri Abhyankar #undef __FUNCT__
236378bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
/*
   MatSolve_SeqBAIJ_5_newdatastruct_v2 - Block forward sweep followed by a block backward
   sweep, unrolled for 5x5 blocks, using the row and column index sets stored in the
   matrix. The backward sweep locates each row's blocks through a->diag (adiag).

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies a->i, a->j, a->a, a->diag, the index
          sets a->row / a->col and the work array a->solve_work
     bb - right-hand side; its array is only read here
   Output:
     xx - written during the backward sweep at block position 5*c[i]

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 25 consecutive scalars with entry (r,c) at v[r+5*c].
   Intermediate results for block row i live in the work array t at offset 5*i.
   The backward sweep reads adiag[i+1] for i = n-1, i.e. adiag[n] must be readable.

   NOTE(review): t[0..4] and b[5*r[0]..] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
236478bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
236578bb4007SShri Abhyankar {
236678bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
236778bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
236878bb4007SShri Abhyankar   PetscErrorCode    ierr;
236978bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
237078bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
237178bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
237278bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
237378bb4007SShri Abhyankar   const PetscScalar *b;
237478bb4007SShri Abhyankar 
237578bb4007SShri Abhyankar   PetscFunctionBegin;
237678bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
237778bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
237878bb4007SShri Abhyankar   t  = a->solve_work;
237978bb4007SShri Abhyankar 
238078bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
238178bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
238278bb4007SShri Abhyankar 
238378bb4007SShri Abhyankar   /* forward solve the lower triangular: row i starts from the b block at 5*r[i] and subtracts blocks ai[i] .. ai[i+1]-1 applied to t */
238478bb4007SShri Abhyankar   idx    = 5*r[0];
238578bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
238678bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
238778bb4007SShri Abhyankar   for (i=1; i<n; i++) {
238878bb4007SShri Abhyankar     v     = aa + 25*ai[i];
238978bb4007SShri Abhyankar     vi    = aj + ai[i];
239078bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
239178bb4007SShri Abhyankar     idx   = 5*r[i];
239278bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
239378bb4007SShri Abhyankar     s5  = b[4+idx];
239478bb4007SShri Abhyankar     for(m=0;m<nz;m++){
239578bb4007SShri Abhyankar       idx   = 5*vi[m];
239678bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
239778bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
239878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
239978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
240078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
240178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
240278bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
240378bb4007SShri Abhyankar       v += 25;
240478bb4007SShri Abhyankar     }
240578bb4007SShri Abhyankar     idx = 5*i;
240678bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
240778bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
240878bb4007SShri Abhyankar   }
240978bb4007SShri Abhyankar   /* backward solve the upper triangular: row i subtracts blocks adiag[i+1]+1 .. adiag[i]-1 applied to t */
241078bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
241178bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
241278bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
241378bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
241478bb4007SShri Abhyankar     idt  = 5*i;
241578bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
241678bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
241778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
241878bb4007SShri Abhyankar       idx   = 5*vi[m];
241978bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
242078bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
242178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
242278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
242378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
242478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
242578bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
242678bb4007SShri Abhyankar       v += 25;
242778bb4007SShri Abhyankar     }
242878bb4007SShri Abhyankar     idc = 5*c[i]; /* v now points at block adiag[i], applied below as a 5x5 multiply; presumably the already-inverted diagonal block -- confirm against the factorization */
242978bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
243078bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
243178bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
243278bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
243378bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
243478bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
243578bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
243678bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
243778bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
243878bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
243978bb4007SShri Abhyankar   }
244078bb4007SShri Abhyankar 
244178bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
244278bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
244378bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
244478bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
244578bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
244678bb4007SShri Abhyankar   PetscFunctionReturn(0);
244778bb4007SShri Abhyankar }
244878bb4007SShri Abhyankar 
24498f690400SShri Abhyankar #undef __FUNCT__
24504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Block forward sweep followed by a block backward
   sweep, unrolled for 5x5 blocks, with no row/column permutation and no work array:
   both sweeps operate directly on the array of xx.

   Input:
     A  - matrix whose Mat_SeqBAIJ data supplies a->i, a->j, a->a and a->diag
     bb - right-hand side; its array is only read here
   Output:
     xx - written by the forward sweep, then updated in place by the backward sweep

   Returns 0 on the normal path; every PETSc call is followed by CHKERRQ(ierr).

   Each block is read as 25 consecutive scalars with entry (r,c) at v[r+5*c].

   NOTE(review): x[0..4] and b[0..4] are accessed before n is examined, so this
   presumably expects n >= 1 -- confirm.
*/
2451dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245215091d37SBarry Smith {
245315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2455dfbe8321SBarry Smith   PetscErrorCode    ierr;
2456690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2457d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2458d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2459d9fead3dSBarry Smith   const PetscScalar *b;
246015091d37SBarry Smith 
246115091d37SBarry Smith   PetscFunctionBegin;
2462d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24631ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
246415091d37SBarry Smith   /* forward solve the lower triangular: block row 0 is copied from b; row i>=1 starts from b and subtracts blocks ai[i] .. diag[i]-1 applied to x */
246515091d37SBarry Smith   idx    = 0;
246615091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
246715091d37SBarry Smith   for (i=1; i<n; i++) {
246815091d37SBarry Smith     v     =  aa + 25*ai[i];
246915091d37SBarry Smith     vi    =  aj + ai[i];
247015091d37SBarry Smith     nz    =  diag[i] - ai[i];
247115091d37SBarry Smith     idx   =  5*i;
2472f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
247315091d37SBarry Smith     while (nz--) {
247415091d37SBarry Smith       jdx   = 5*(*vi++); /* first scalar of the referenced block column in x */
247515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2476f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2477f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2478f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2479f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2480f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
248115091d37SBarry Smith       v    += 25;
248215091d37SBarry Smith     }
2483f1af5d2fSBarry Smith     x[idx]   = s1;
2484f1af5d2fSBarry Smith     x[1+idx] = s2;
2485f1af5d2fSBarry Smith     x[2+idx] = s3;
2486f1af5d2fSBarry Smith     x[3+idx] = s4;
2487f1af5d2fSBarry Smith     x[4+idx] = s5;
248815091d37SBarry Smith   }
248915091d37SBarry Smith   /* backward solve the upper triangular: row i subtracts blocks diag[i]+1 .. ai[i+1]-1 applied to x */
249015091d37SBarry Smith   for (i=n-1; i>=0; i--){
249115091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
249215091d37SBarry Smith     vi   = aj + diag[i] + 1;
249315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
249415091d37SBarry Smith     idt  = 5*i;
2495f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2496f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
249715091d37SBarry Smith     while (nz--) {
249815091d37SBarry Smith       idx   = 5*(*vi++);
249915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2500f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2501f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2502f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2503f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2504f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
250515091d37SBarry Smith       v    += 25;
250615091d37SBarry Smith     }
250715091d37SBarry Smith     v        = aa + 25*diag[i]; /* block at diag[i] is applied as a 5x5 multiply; presumably it holds the already-inverted diagonal block -- confirm against the factorization */
2508f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2509f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2510f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2511f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2512f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
251315091d37SBarry Smith   }
251415091d37SBarry Smith 
2515d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2517dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
251815091d37SBarry Smith   PetscFunctionReturn(0);
251915091d37SBarry Smith }
252015091d37SBarry Smith 
25214a2ae208SSatish Balay #undef __FUNCT__
2522cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2523cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2524cee9d6f2SShri Abhyankar {
2525cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
25266464896eSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2527cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
2528cee9d6f2SShri Abhyankar   PetscInt          jdx;
2529cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2530cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2531cee9d6f2SShri Abhyankar   const PetscScalar *b;
2532cee9d6f2SShri Abhyankar 
2533cee9d6f2SShri Abhyankar   PetscFunctionBegin;
2534cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2535cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2536cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
2537cee9d6f2SShri Abhyankar   idx    = 0;
2538cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2539cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
2540cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
2541cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
2542cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
2543cee9d6f2SShri Abhyankar     idx = 5*i;
2544cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
25456464896eSShri Abhyankar     for(k=0;k<nz;k++) {
25466464896eSShri Abhyankar       jdx   = 5*vi[k];
2547cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2548cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2549cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2550cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2551cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2552cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2553cee9d6f2SShri Abhyankar       v    += 25;
2554cee9d6f2SShri Abhyankar     }
2555cee9d6f2SShri Abhyankar     x[idx]   = s1;
2556cee9d6f2SShri Abhyankar     x[1+idx] = s2;
2557cee9d6f2SShri Abhyankar     x[2+idx] = s3;
2558cee9d6f2SShri Abhyankar     x[3+idx] = s4;
2559cee9d6f2SShri Abhyankar     x[4+idx] = s5;
2560cee9d6f2SShri Abhyankar   }
2561cee9d6f2SShri Abhyankar 
2562cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
2563cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2564cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
2565cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
2566cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2567cee9d6f2SShri Abhyankar     idt = 5*i;
2568cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
2569cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
25706464896eSShri Abhyankar     for(k=0;k<nz;k++){
25716464896eSShri Abhyankar       idx   = 5*vi[k];
2572cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2573cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2574cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2575cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2576cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2577cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2578cee9d6f2SShri Abhyankar       v    += 25;
2579cee9d6f2SShri Abhyankar     }
2580cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2581cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2582cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2583cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2584cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2585cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2586cee9d6f2SShri Abhyankar   }
2587cee9d6f2SShri Abhyankar 
2588cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2589cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2590cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2591cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2592cee9d6f2SShri Abhyankar }
2593cee9d6f2SShri Abhyankar 
2594cee9d6f2SShri Abhyankar #undef __FUNCT__
259553cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
259653cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
259753cca76cSShri Abhyankar {
259853cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
259953cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
260053cca76cSShri Abhyankar   PetscErrorCode    ierr;
260153cca76cSShri Abhyankar   PetscInt          jdx;
260253cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
260353cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
260453cca76cSShri Abhyankar   const PetscScalar *b;
260553cca76cSShri Abhyankar 
260653cca76cSShri Abhyankar   PetscFunctionBegin;
260753cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
260853cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260953cca76cSShri Abhyankar   /* forward solve the lower triangular */
261053cca76cSShri Abhyankar   idx    = 0;
261153cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
261253cca76cSShri Abhyankar   for (i=1; i<n; i++) {
261353cca76cSShri Abhyankar     v   = aa + 25*ai[i];
261453cca76cSShri Abhyankar     vi  = aj + ai[i];
261553cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
261653cca76cSShri Abhyankar     idx = 5*i;
261753cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
261853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
261953cca76cSShri Abhyankar       jdx   = 5*vi[k];
262053cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
262153cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
262253cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
262353cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
262453cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
262553cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
262653cca76cSShri Abhyankar       v    += 25;
262753cca76cSShri Abhyankar     }
262853cca76cSShri Abhyankar     x[idx]   = s1;
262953cca76cSShri Abhyankar     x[1+idx] = s2;
263053cca76cSShri Abhyankar     x[2+idx] = s3;
263153cca76cSShri Abhyankar     x[3+idx] = s4;
263253cca76cSShri Abhyankar     x[4+idx] = s5;
263353cca76cSShri Abhyankar   }
263453cca76cSShri Abhyankar 
263553cca76cSShri Abhyankar   /* backward solve the upper triangular */
263653cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
263753cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
263853cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
263953cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
264053cca76cSShri Abhyankar     idt = 5*i;
264153cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
264253cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
264353cca76cSShri Abhyankar     for(k=0;k<nz;k++){
264453cca76cSShri Abhyankar       idx   = 5*vi[k];
264553cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
264653cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
264753cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
264853cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
264953cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
265053cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
265153cca76cSShri Abhyankar       v    += 25;
265253cca76cSShri Abhyankar     }
265353cca76cSShri Abhyankar     /* x = inv_diagonal*x */
265453cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
265553cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
265653cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
265753cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
265853cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
265953cca76cSShri Abhyankar   }
266053cca76cSShri Abhyankar 
266153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
266253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
266353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
266453cca76cSShri Abhyankar   PetscFunctionReturn(0);
266553cca76cSShri Abhyankar }
266653cca76cSShri Abhyankar 
266753cca76cSShri Abhyankar #undef __FUNCT__
26684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2669dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
26704e2b4712SSatish Balay {
26714e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
26724e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
26736849ba73SBarry Smith   PetscErrorCode    ierr;
26745d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
26755d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2676d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2677d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2678d9fead3dSBarry Smith   const PetscScalar *b;
26794e2b4712SSatish Balay 
26804e2b4712SSatish Balay   PetscFunctionBegin;
2681d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2683f1af5d2fSBarry Smith   t  = a->solve_work;
26844e2b4712SSatish Balay 
26854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
26864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
26874e2b4712SSatish Balay 
26884e2b4712SSatish Balay   /* forward solve the lower triangular */
26894e2b4712SSatish Balay   idx    = 4*(*r++);
2690f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2691f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
26924e2b4712SSatish Balay   for (i=1; i<n; i++) {
26934e2b4712SSatish Balay     v     = aa + 16*ai[i];
26944e2b4712SSatish Balay     vi    = aj + ai[i];
26954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26964e2b4712SSatish Balay     idx   = 4*(*r++);
2697f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
26984e2b4712SSatish Balay     while (nz--) {
26994e2b4712SSatish Balay       idx   = 4*(*vi++);
2700f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2701f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2702f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2703f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2704f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
27054e2b4712SSatish Balay       v    += 16;
27064e2b4712SSatish Balay     }
27074e2b4712SSatish Balay     idx        = 4*i;
2708f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2709f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
27104e2b4712SSatish Balay   }
27114e2b4712SSatish Balay   /* backward solve the upper triangular */
27124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27134e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
27144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
27164e2b4712SSatish Balay     idt  = 4*i;
2717f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2718f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
27194e2b4712SSatish Balay     while (nz--) {
27204e2b4712SSatish Balay       idx   = 4*(*vi++);
2721f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2722f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2723f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2724f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2725f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2726f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
27274e2b4712SSatish Balay       v += 16;
27284e2b4712SSatish Balay     }
27294e2b4712SSatish Balay     idc      = 4*(*c--);
27304e2b4712SSatish Balay     v        = aa + 16*diag[i];
2731f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2732f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2733f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2734f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
27354e2b4712SSatish Balay   }
27364e2b4712SSatish Balay 
27374e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27384e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2739d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2741dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
27424e2b4712SSatish Balay   PetscFunctionReturn(0);
27434e2b4712SSatish Balay }
2744f26ec98cSKris Buschelman 
2745f26ec98cSKris Buschelman #undef __FUNCT__
27468f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
27478f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
27488f690400SShri Abhyankar {
27498f690400SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
27508f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
27518f690400SShri Abhyankar   PetscErrorCode    ierr;
275229b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
27538f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
27548f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
27558f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
27568f690400SShri Abhyankar   const PetscScalar *b;
27578f690400SShri Abhyankar 
27588f690400SShri Abhyankar   PetscFunctionBegin;
27598f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27608f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27618f690400SShri Abhyankar   t  = a->solve_work;
27628f690400SShri Abhyankar 
27638f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
276429b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
27658f690400SShri Abhyankar 
27668f690400SShri Abhyankar   /* forward solve the lower triangular */
276729b92fc1SShri Abhyankar   idx    = 4*r[0];
27688f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
27698f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
27708f690400SShri Abhyankar   for (i=1; i<n; i++) {
27718f690400SShri Abhyankar     v     = aa + 16*ai[i];
27728f690400SShri Abhyankar     vi    = aj + ai[i];
27738f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
277429b92fc1SShri Abhyankar     idx   = 4*r[i];
27758f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
277629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
277729b92fc1SShri Abhyankar       idx   = 4*vi[m];
27788f690400SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
27798f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
27808f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
27818f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
27828f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
27838f690400SShri Abhyankar       v    += 16;
27848f690400SShri Abhyankar     }
27858f690400SShri Abhyankar     idx        = 4*i;
27868f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
27878f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
27888f690400SShri Abhyankar   }
27898f690400SShri Abhyankar   /* backward solve the upper triangular */
27908f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
27918f690400SShri Abhyankar     k    = 2*n-i;
27928f690400SShri Abhyankar     v    = aa + 16*ai[k];
27938f690400SShri Abhyankar     vi   = aj + ai[k];
27948f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
27958f690400SShri Abhyankar     idt  = 4*i;
27968f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
27978f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
279829b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
279929b92fc1SShri Abhyankar       idx   = 4*vi[m];
28008f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
28018f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
28028f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
28038f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
28048f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
28058f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
28068f690400SShri Abhyankar       v += 16;
28078f690400SShri Abhyankar     }
280829b92fc1SShri Abhyankar     idc      = 4*c[i];
28098f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
28108f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
28118f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
28128f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
28138f690400SShri Abhyankar   }
28148f690400SShri Abhyankar 
28158f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28168f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
28178f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28188f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
28198f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28208f690400SShri Abhyankar   PetscFunctionReturn(0);
28218f690400SShri Abhyankar }
28228f690400SShri Abhyankar 
28238f690400SShri Abhyankar #undef __FUNCT__
282478bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
282578bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
282678bb4007SShri Abhyankar {
282778bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
282878bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
282978bb4007SShri Abhyankar   PetscErrorCode    ierr;
283078bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
283178bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
283278bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
283378bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
283478bb4007SShri Abhyankar   const PetscScalar *b;
283578bb4007SShri Abhyankar 
283678bb4007SShri Abhyankar   PetscFunctionBegin;
283778bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
283878bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
283978bb4007SShri Abhyankar   t  = a->solve_work;
284078bb4007SShri Abhyankar 
284178bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
284278bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
284378bb4007SShri Abhyankar 
284478bb4007SShri Abhyankar   /* forward solve the lower triangular */
284578bb4007SShri Abhyankar   idx    = 4*r[0];
284678bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
284778bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
284878bb4007SShri Abhyankar   for (i=1; i<n; i++) {
284978bb4007SShri Abhyankar     v     = aa + 16*ai[i];
285078bb4007SShri Abhyankar     vi    = aj + ai[i];
285178bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
285278bb4007SShri Abhyankar     idx   = 4*r[i];
285378bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
285478bb4007SShri Abhyankar     for(m=0;m<nz;m++){
285578bb4007SShri Abhyankar       idx   = 4*vi[m];
285678bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
285778bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
285878bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
285978bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
286078bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
286178bb4007SShri Abhyankar       v    += 16;
286278bb4007SShri Abhyankar     }
286378bb4007SShri Abhyankar     idx        = 4*i;
286478bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
286578bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
286678bb4007SShri Abhyankar   }
286778bb4007SShri Abhyankar   /* backward solve the upper triangular */
286878bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
286978bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
287078bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
287178bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
287278bb4007SShri Abhyankar     idt  = 4*i;
287378bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
287478bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
287578bb4007SShri Abhyankar     for(m=0;m<nz;m++){
287678bb4007SShri Abhyankar       idx   = 4*vi[m];
287778bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
287878bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
287978bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
288078bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
288178bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
288278bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
288378bb4007SShri Abhyankar       v += 16;
288478bb4007SShri Abhyankar     }
288578bb4007SShri Abhyankar     idc      = 4*c[i];
288678bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
288778bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
288878bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
288978bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
289078bb4007SShri Abhyankar   }
289178bb4007SShri Abhyankar 
289278bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
289378bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
289478bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
289578bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
289678bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
289778bb4007SShri Abhyankar   PetscFunctionReturn(0);
289878bb4007SShri Abhyankar }
289978bb4007SShri Abhyankar 
290078bb4007SShri Abhyankar #undef __FUNCT__
2901f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2902dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2903f26ec98cSKris Buschelman {
2904f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2905f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
29066849ba73SBarry Smith   PetscErrorCode    ierr;
29075d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
29085d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2909d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2910d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2911d9fead3dSBarry Smith   PetscScalar       *x;
2912d9fead3dSBarry Smith   const PetscScalar *b;
2913f26ec98cSKris Buschelman 
2914f26ec98cSKris Buschelman   PetscFunctionBegin;
2915d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2917f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2918f26ec98cSKris Buschelman 
2919f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2920f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2921f26ec98cSKris Buschelman 
2922f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2923f26ec98cSKris Buschelman   idx    = 4*(*r++);
2924f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2925f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2926f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2927f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2928f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2929f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2930f26ec98cSKris Buschelman     vi    = aj + ai[i];
2931f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2932f26ec98cSKris Buschelman     idx   = 4*(*r++);
2933f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2934f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2935f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2936f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2937f26ec98cSKris Buschelman     while (nz--) {
2938f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2939f26ec98cSKris Buschelman       x1  = t[idx];
2940f26ec98cSKris Buschelman       x2  = t[1+idx];
2941f26ec98cSKris Buschelman       x3  = t[2+idx];
2942f26ec98cSKris Buschelman       x4  = t[3+idx];
2943f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2944f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2945f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2946f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2947f26ec98cSKris Buschelman       v    += 16;
2948f26ec98cSKris Buschelman     }
2949f26ec98cSKris Buschelman     idx        = 4*i;
2950f26ec98cSKris Buschelman     t[idx]   = s1;
2951f26ec98cSKris Buschelman     t[1+idx] = s2;
2952f26ec98cSKris Buschelman     t[2+idx] = s3;
2953f26ec98cSKris Buschelman     t[3+idx] = s4;
2954f26ec98cSKris Buschelman   }
2955f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2956f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2957f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2958f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2959f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2960f26ec98cSKris Buschelman     idt  = 4*i;
2961f26ec98cSKris Buschelman     s1 = t[idt];
2962f26ec98cSKris Buschelman     s2 = t[1+idt];
2963f26ec98cSKris Buschelman     s3 = t[2+idt];
2964f26ec98cSKris Buschelman     s4 = t[3+idt];
2965f26ec98cSKris Buschelman     while (nz--) {
2966f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2967f26ec98cSKris Buschelman       x1  = t[idx];
2968f26ec98cSKris Buschelman       x2  = t[1+idx];
2969f26ec98cSKris Buschelman       x3  = t[2+idx];
2970f26ec98cSKris Buschelman       x4  = t[3+idx];
2971f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2972f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2973f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2974f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2975f26ec98cSKris Buschelman       v += 16;
2976f26ec98cSKris Buschelman     }
2977f26ec98cSKris Buschelman     idc      = 4*(*c--);
2978f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2979f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2980f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2981f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2982f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2983f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2984f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2985f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2986f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2987f26ec98cSKris Buschelman  }
2988f26ec98cSKris Buschelman 
2989f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2990f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2991d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2993dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2994f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2995f26ec98cSKris Buschelman }
2996f26ec98cSKris Buschelman 
299724c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
299824c233c2SKris Buschelman 
299924c233c2SKris Buschelman #include PETSC_HAVE_SSE
300024c233c2SKris Buschelman 
300124c233c2SKris Buschelman #undef __FUNCT__
300224c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
300424c233c2SKris Buschelman {
300524c233c2SKris Buschelman   /*
300624c233c2SKris Buschelman      Note: This code uses demotion of double
300724c233c2SKris Buschelman      to float when performing the mixed-mode computation.
300824c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
300924c233c2SKris Buschelman   */
301024c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
301124c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
30126849ba73SBarry Smith   PetscErrorCode ierr;
30135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
30145d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
301524c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
301687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
301724c233c2SKris Buschelman 
301824c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
301924c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
302024c233c2SKris Buschelman   unsigned long   offset;
302124c233c2SKris Buschelman 
302224c233c2SKris Buschelman   PetscFunctionBegin;
302324c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
302424c233c2SKris Buschelman 
302524c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
302624c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
302724c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
302824c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
302924c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
303024c233c2SKris Buschelman 
30311ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30321ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
303324c233c2SKris Buschelman     t  = a->solve_work;
303424c233c2SKris Buschelman 
303524c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
303624c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
303724c233c2SKris Buschelman 
303824c233c2SKris Buschelman     /* forward solve the lower triangular */
303924c233c2SKris Buschelman     idx  = 4*(*r++);
304024c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
304124c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
304224c233c2SKris Buschelman     v    =  aa + 16*ai[1];
304324c233c2SKris Buschelman 
304424c233c2SKris Buschelman     for (i=1; i<n;) {
304524c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
304624c233c2SKris Buschelman       vi   =  aj      + ai[i];
304724c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
304824c233c2SKris Buschelman       idx  =  4*(*r++);
304924c233c2SKris Buschelman 
305024c233c2SKris Buschelman       /* Demote sum from double to float */
305124c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
305224c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
305324c233c2SKris Buschelman 
305424c233c2SKris Buschelman       while (nz--) {
305524c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
305624c233c2SKris Buschelman         idx = 4*(*vi++);
305724c233c2SKris Buschelman 
305824c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
305924c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
306024c233c2SKris Buschelman 
306124c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
306224c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
306324c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
306424c233c2SKris Buschelman 
306524c233c2SKris Buschelman           /* First Column */
306624c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
306724c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
306824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
306924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
307024c233c2SKris Buschelman 
307124c233c2SKris Buschelman           /* Second Column */
307224c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
307324c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
307424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
307524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
307624c233c2SKris Buschelman 
307724c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
307824c233c2SKris Buschelman 
307924c233c2SKris Buschelman           /* Third Column */
308024c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
308124c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
308224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
308324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
308424c233c2SKris Buschelman 
308524c233c2SKris Buschelman           /* Fourth Column */
308624c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
308724c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
308824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
308924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
309024c233c2SKris Buschelman         SSE_INLINE_END_2
309124c233c2SKris Buschelman 
309224c233c2SKris Buschelman         v  += 16;
309324c233c2SKris Buschelman       }
309424c233c2SKris Buschelman       idx = 4*i;
309524c233c2SKris Buschelman       v   = aa + 16*ai[++i];
309624c233c2SKris Buschelman       PREFETCH_NTA(v);
309724c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
309824c233c2SKris Buschelman 
309924c233c2SKris Buschelman       /* Promote result from float to double */
310024c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
310124c233c2SKris Buschelman     }
310224c233c2SKris Buschelman     /* backward solve the upper triangular */
310324c233c2SKris Buschelman     idt  = 4*(n-1);
310424c233c2SKris Buschelman     ai16 = 16*diag[n-1];
310524c233c2SKris Buschelman     v    = aa + ai16 + 16;
310624c233c2SKris Buschelman     for (i=n-1; i>=0;){
310724c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
310824c233c2SKris Buschelman       vi = aj + diag[i] + 1;
310924c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
311024c233c2SKris Buschelman 
311124c233c2SKris Buschelman       /* Demote accumulator from double to float */
311224c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
311324c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
311424c233c2SKris Buschelman 
311524c233c2SKris Buschelman       while (nz--) {
311624c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
311724c233c2SKris Buschelman         idx = 4*(*vi++);
311824c233c2SKris Buschelman 
311924c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
312024c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
312124c233c2SKris Buschelman 
312224c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
312324c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
312424c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
312524c233c2SKris Buschelman 
312624c233c2SKris Buschelman           /* First Column */
312724c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
312824c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
312924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
313024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
313124c233c2SKris Buschelman 
313224c233c2SKris Buschelman           /* Second Column */
313324c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
313424c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
313524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
313624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
313724c233c2SKris Buschelman 
313824c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
313924c233c2SKris Buschelman 
314024c233c2SKris Buschelman           /* Third Column */
314124c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
314224c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
314324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
314424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
314524c233c2SKris Buschelman 
314624c233c2SKris Buschelman           /* Fourth Column */
314724c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
314824c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
314924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
315024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
315124c233c2SKris Buschelman         SSE_INLINE_END_2
315224c233c2SKris Buschelman         v  += 16;
315324c233c2SKris Buschelman       }
315424c233c2SKris Buschelman       v    = aa + ai16;
315524c233c2SKris Buschelman       ai16 = 16*diag[--i];
315624c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
315724c233c2SKris Buschelman       /*
315824c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
315924c233c2SKris Buschelman          which was inverted as part of the factorization
316024c233c2SKris Buschelman       */
316124c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
316224c233c2SKris Buschelman         /* First Column */
316324c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
316424c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
316524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
316624c233c2SKris Buschelman 
316724c233c2SKris Buschelman         /* Second Column */
316824c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
316924c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
317024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
317124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
317224c233c2SKris Buschelman 
317324c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
317424c233c2SKris Buschelman 
317524c233c2SKris Buschelman         /* Third Column */
317624c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
317724c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
317824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
317924c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
318024c233c2SKris Buschelman 
318124c233c2SKris Buschelman         /* Fourth Column */
318224c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
318324c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
318424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
318524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
318624c233c2SKris Buschelman 
318724c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
318824c233c2SKris Buschelman       SSE_INLINE_END_3
318924c233c2SKris Buschelman 
319024c233c2SKris Buschelman       /* Promote solution from float to double */
319124c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
319224c233c2SKris Buschelman 
319324c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
319424c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
319524c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
319624c233c2SKris Buschelman       idc  = 4*(*c--);
319724c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
319824c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
319924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
320024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
320124c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
320224c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
320324c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
320424c233c2SKris Buschelman       SSE_INLINE_END_2
320524c233c2SKris Buschelman       v    = aa + ai16 + 16;
320624c233c2SKris Buschelman       idt -= 4;
320724c233c2SKris Buschelman     }
320824c233c2SKris Buschelman 
320924c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
321024c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
32111ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32121ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3213dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
321424c233c2SKris Buschelman   SSE_SCOPE_END;
321524c233c2SKris Buschelman   PetscFunctionReturn(0);
321624c233c2SKris Buschelman }
321724c233c2SKris Buschelman 
321824c233c2SKris Buschelman #endif
32190ef38995SBarry Smith 
32200ef38995SBarry Smith 
32214e2b4712SSatish Balay /*
32224e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
32234e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
32244e2b4712SSatish Balay */
32254a2ae208SSatish Balay #undef __FUNCT__
32264a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3227dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
32284e2b4712SSatish Balay {
32294e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3230356650c2SBarry Smith   PetscInt          n=a->mbs;
3231356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3232dfbe8321SBarry Smith   PetscErrorCode    ierr;
3233356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3234d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3235d9fead3dSBarry Smith   PetscScalar       *x;
3236d9fead3dSBarry Smith   const PetscScalar *b;
32374e2b4712SSatish Balay 
32384e2b4712SSatish Balay   PetscFunctionBegin;
3239d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
32414e2b4712SSatish Balay 
3242aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
32432853dc0eSBarry Smith   {
324487828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
32452853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
32462853dc0eSBarry Smith   }
3247aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
32482853dc0eSBarry Smith   {
324987828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
32502853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
32512853dc0eSBarry Smith   }
3252aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
32532853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3254e1293385SBarry Smith #else
325530d4dcafSBarry Smith   {
325687828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3257d9fead3dSBarry Smith     const MatScalar *v;
3258356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3259356650c2SBarry Smith     const PetscInt  *vi;
3260e1293385SBarry Smith 
32614e2b4712SSatish Balay   /* forward solve the lower triangular */
32624e2b4712SSatish Balay   idx    = 0;
3263e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
32644e2b4712SSatish Balay   for (i=1; i<n; i++) {
32654e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
32664e2b4712SSatish Balay     vi    =  aj      + ai[i];
32674e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3268e1293385SBarry Smith     idx   +=  4;
3269f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
32704e2b4712SSatish Balay     while (nz--) {
32714e2b4712SSatish Balay       jdx   = 4*(*vi++);
32724e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3273f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3274f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3275f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3276f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
32774e2b4712SSatish Balay       v    += 16;
32784e2b4712SSatish Balay     }
3279f1af5d2fSBarry Smith     x[idx]   = s1;
3280f1af5d2fSBarry Smith     x[1+idx] = s2;
3281f1af5d2fSBarry Smith     x[2+idx] = s3;
3282f1af5d2fSBarry Smith     x[3+idx] = s4;
32834e2b4712SSatish Balay   }
32844e2b4712SSatish Balay   /* backward solve the upper triangular */
32854e555682SBarry Smith   idt = 4*(n-1);
32864e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
32874e555682SBarry Smith     ai16 = 16*diag[i];
32884e555682SBarry Smith     v    = aa + ai16 + 16;
32894e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
32904e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3291f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3292f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
32934e2b4712SSatish Balay     while (nz--) {
32944e2b4712SSatish Balay       idx   = 4*(*vi++);
32954e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3296f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3297f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3298f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3299f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
33004e2b4712SSatish Balay       v    += 16;
33014e2b4712SSatish Balay     }
33024e555682SBarry Smith     v        = aa + ai16;
3303f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3304f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3305f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3306f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3307329f5518SBarry Smith     idt -= 4;
33084e2b4712SSatish Balay   }
330930d4dcafSBarry Smith   }
3310e1293385SBarry Smith #endif
33114e2b4712SSatish Balay 
3312d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3314dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
33154e2b4712SSatish Balay   PetscFunctionReturn(0);
33164e2b4712SSatish Balay }
33174e2b4712SSatish Balay 
3318f26ec98cSKris Buschelman #undef __FUNCT__
3319cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3320cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3321cee9d6f2SShri Abhyankar {
3322cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
33236464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3324cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3325cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3326cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3327cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3328cee9d6f2SShri Abhyankar     PetscScalar       *x;
3329cee9d6f2SShri Abhyankar     const PetscScalar *b;
3330cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3331cee9d6f2SShri Abhyankar 
3332cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3333cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3334cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3335cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3336cee9d6f2SShri Abhyankar     idx    = 0;
3337cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3338cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3339cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3340cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3341cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3342cee9d6f2SShri Abhyankar       idx   = bs*i;
3343cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
33446464896eSShri Abhyankar       for(k=0;k<nz;k++) {
33456464896eSShri Abhyankar           jdx   = bs*vi[k];
3346cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3347cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3348cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3349cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3350cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3351cee9d6f2SShri Abhyankar 
3352cee9d6f2SShri Abhyankar           v   +=  bs2;
3353cee9d6f2SShri Abhyankar         }
3354cee9d6f2SShri Abhyankar 
3355cee9d6f2SShri Abhyankar        x[idx]   = s1;
3356cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3357cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3358cee9d6f2SShri Abhyankar        x[3+idx] = s4;
3359cee9d6f2SShri Abhyankar     }
3360cee9d6f2SShri Abhyankar 
3361cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3362cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3363cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3364cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3365cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3366cee9d6f2SShri Abhyankar      idt = bs*i;
3367cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3368cee9d6f2SShri Abhyankar 
33696464896eSShri Abhyankar     for(k=0;k<nz;k++){
33706464896eSShri Abhyankar       idx   = bs*vi[k];
3371cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3372cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3373cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3374cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3375cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3376cee9d6f2SShri Abhyankar 
3377cee9d6f2SShri Abhyankar         v   +=  bs2;
3378cee9d6f2SShri Abhyankar     }
3379cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3380cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3381cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3382cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3383cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3384cee9d6f2SShri Abhyankar 
3385cee9d6f2SShri Abhyankar   }
3386cee9d6f2SShri Abhyankar 
3387cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3388cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3389cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3390cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3391cee9d6f2SShri Abhyankar }
3392cee9d6f2SShri Abhyankar 
3393b2b2dd24SShri Abhyankar #undef __FUNCT__
3394b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3395b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3396b2b2dd24SShri Abhyankar {
3397b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3398b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3399b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3400b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3401b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3402b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3403b2b2dd24SShri Abhyankar     PetscScalar       *x;
3404b2b2dd24SShri Abhyankar     const PetscScalar *b;
3405b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3406cee9d6f2SShri Abhyankar 
3407b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3408b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3409b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3410b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3411b2b2dd24SShri Abhyankar     idx    = 0;
3412b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3413b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3414b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3415b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3416b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3417b2b2dd24SShri Abhyankar       idx   = bs*i;
3418b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3419b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3420b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3421b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3422b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3423b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3424b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3425b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3426b2b2dd24SShri Abhyankar 
3427b2b2dd24SShri Abhyankar           v   +=  bs2;
3428b2b2dd24SShri Abhyankar         }
3429b2b2dd24SShri Abhyankar 
3430b2b2dd24SShri Abhyankar        x[idx]   = s1;
3431b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3432b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3433b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3434b2b2dd24SShri Abhyankar     }
3435b2b2dd24SShri Abhyankar 
3436b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3437b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3438b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3439b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3440b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3441b2b2dd24SShri Abhyankar      idt = bs*i;
3442b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3443b2b2dd24SShri Abhyankar 
3444b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3445b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3446b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3447b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3448b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3449b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3450b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3451b2b2dd24SShri Abhyankar 
3452b2b2dd24SShri Abhyankar         v   +=  bs2;
3453b2b2dd24SShri Abhyankar     }
3454b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3455b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3456b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3457b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3458b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3459b2b2dd24SShri Abhyankar 
3460b2b2dd24SShri Abhyankar   }
3461b2b2dd24SShri Abhyankar 
3462b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3463b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3464b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3465b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3466b2b2dd24SShri Abhyankar }
3467cee9d6f2SShri Abhyankar 
3468cee9d6f2SShri Abhyankar #undef __FUNCT__
3469f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3471f26ec98cSKris Buschelman {
3472f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3473690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3474dfbe8321SBarry Smith   PetscErrorCode ierr;
3475690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3476f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3477f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3478f26ec98cSKris Buschelman 
3479f26ec98cSKris Buschelman   PetscFunctionBegin;
34801ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3482f26ec98cSKris Buschelman 
3483f26ec98cSKris Buschelman   {
3484f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3485f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3486690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3487f26ec98cSKris Buschelman 
3488f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3489f26ec98cSKris Buschelman     idx  = 0;
3490f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3491f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3492f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3493f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3494f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3495f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3496f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3497f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3498f26ec98cSKris Buschelman       idx   +=  4;
3499f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3500f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3501f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3502f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3503f26ec98cSKris Buschelman       while (nz--) {
3504f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3505f26ec98cSKris Buschelman         x1  = t[jdx];
3506f26ec98cSKris Buschelman         x2  = t[1+jdx];
3507f26ec98cSKris Buschelman         x3  = t[2+jdx];
3508f26ec98cSKris Buschelman         x4  = t[3+jdx];
3509f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3510f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3511f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3512f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3513f26ec98cSKris Buschelman         v    += 16;
3514f26ec98cSKris Buschelman       }
3515f26ec98cSKris Buschelman       t[idx]   = s1;
3516f26ec98cSKris Buschelman       t[1+idx] = s2;
3517f26ec98cSKris Buschelman       t[2+idx] = s3;
3518f26ec98cSKris Buschelman       t[3+idx] = s4;
3519f26ec98cSKris Buschelman     }
3520f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3521f26ec98cSKris Buschelman     idt = 4*(n-1);
3522f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3523f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3524f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3525f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3526f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3527f26ec98cSKris Buschelman       s1   = t[idt];
3528f26ec98cSKris Buschelman       s2   = t[1+idt];
3529f26ec98cSKris Buschelman       s3   = t[2+idt];
3530f26ec98cSKris Buschelman       s4   = t[3+idt];
3531f26ec98cSKris Buschelman       while (nz--) {
3532f26ec98cSKris Buschelman         idx = 4*(*vi++);
3533f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3534f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3535f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3536f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3537f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3538f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3539f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3540f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3541f26ec98cSKris Buschelman         v    += 16;
3542f26ec98cSKris Buschelman       }
3543f26ec98cSKris Buschelman       v        = aa + ai16;
3544f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3545f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3546f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3547f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3548f26ec98cSKris Buschelman       idt -= 4;
3549f26ec98cSKris Buschelman     }
3550f26ec98cSKris Buschelman   }
3551f26ec98cSKris Buschelman 
35521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
35531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3554dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3555f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3556f26ec98cSKris Buschelman }
3557f26ec98cSKris Buschelman 
35583660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
35593660e330SKris Buschelman 
35603660e330SKris Buschelman #include PETSC_HAVE_SSE
35613660e330SKris Buschelman #undef __FUNCT__
35627cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3563dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
35643660e330SKris Buschelman {
35653660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
35662aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3567dfbe8321SBarry Smith   PetscErrorCode ierr;
3568dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
35693660e330SKris Buschelman   MatScalar      *aa=a->a;
357087828ca2SBarry Smith   PetscScalar    *x,*b;
35713660e330SKris Buschelman 
35723660e330SKris Buschelman   PetscFunctionBegin;
35733660e330SKris Buschelman   SSE_SCOPE_BEGIN;
35743660e330SKris Buschelman   /*
35753660e330SKris Buschelman      Note: This code currently uses demotion of double
35763660e330SKris Buschelman      to float when performing the mixed-mode computation.
35773660e330SKris Buschelman      This may not be numerically reasonable for all applications.
35783660e330SKris Buschelman   */
35793660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
35803660e330SKris Buschelman 
35811ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
35821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
35833660e330SKris Buschelman   {
3584eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3585eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
35862aa5897fSKris Buschelman     int            nz,i,idt,ai16;
35872aa5897fSKris Buschelman     unsigned int   jdx,idx;
35882aa5897fSKris Buschelman     unsigned short *vi;
3589eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
35903660e330SKris Buschelman 
3591eb05f457SKris Buschelman     /* First block is the identity. */
35923660e330SKris Buschelman     idx  = 0;
3593eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
35942aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
35953660e330SKris Buschelman 
35963660e330SKris Buschelman     for (i=1; i<n;) {
35973660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
35983660e330SKris Buschelman       vi   =  aj      + ai[i];
35993660e330SKris Buschelman       nz   =  diag[i] - ai[i];
36003660e330SKris Buschelman       idx +=  4;
36013660e330SKris Buschelman 
3602eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3603eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3604eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
36053660e330SKris Buschelman 
36063660e330SKris Buschelman       while (nz--) {
36073660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
36082aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
36093660e330SKris Buschelman 
36103660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3611eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
36123660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
36133660e330SKris Buschelman 
36143660e330SKris Buschelman           /* First Column */
36153660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
36163660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
36173660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
36183660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
36193660e330SKris Buschelman 
36203660e330SKris Buschelman           /* Second Column */
36213660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
36223660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
36233660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
36243660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
36253660e330SKris Buschelman 
36263660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
36273660e330SKris Buschelman 
36283660e330SKris Buschelman           /* Third Column */
36293660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
36303660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
36313660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
36323660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
36333660e330SKris Buschelman 
36343660e330SKris Buschelman           /* Fourth Column */
36353660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
36363660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
36373660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
36383660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36393660e330SKris Buschelman         SSE_INLINE_END_2
36403660e330SKris Buschelman 
36413660e330SKris Buschelman         v  += 16;
36423660e330SKris Buschelman       }
36433660e330SKris Buschelman       v    =  aa + 16*ai[++i];
36443660e330SKris Buschelman       PREFETCH_NTA(v);
3645eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
36463660e330SKris Buschelman     }
3647eb05f457SKris Buschelman 
3648eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3649eb05f457SKris Buschelman 
36503660e330SKris Buschelman     idt  = 4*(n-1);
36513660e330SKris Buschelman     ai16 = 16*diag[n-1];
36523660e330SKris Buschelman     v    = aa + ai16 + 16;
36533660e330SKris Buschelman     for (i=n-1; i>=0;){
36543660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
36553660e330SKris Buschelman       vi = aj + diag[i] + 1;
36563660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
36573660e330SKris Buschelman 
3658eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
36593660e330SKris Buschelman 
36603660e330SKris Buschelman       while (nz--) {
36613660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
36622aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
36633660e330SKris Buschelman 
36643660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3665eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
36663660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
36673660e330SKris Buschelman 
36683660e330SKris Buschelman           /* First Column */
36693660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
36703660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
36713660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
36723660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
36733660e330SKris Buschelman 
36743660e330SKris Buschelman           /* Second Column */
36753660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
36763660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
36773660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
36783660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
36793660e330SKris Buschelman 
36803660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
36813660e330SKris Buschelman 
36823660e330SKris Buschelman           /* Third Column */
36833660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
36843660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
36853660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
36863660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
36873660e330SKris Buschelman 
36883660e330SKris Buschelman           /* Fourth Column */
36893660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
36903660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
36913660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
36923660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36933660e330SKris Buschelman         SSE_INLINE_END_2
36943660e330SKris Buschelman         v  += 16;
36953660e330SKris Buschelman       }
36963660e330SKris Buschelman       v    = aa + ai16;
36973660e330SKris Buschelman       ai16 = 16*diag[--i];
36983660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
36993660e330SKris Buschelman       /*
37003660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
37013660e330SKris Buschelman          which was inverted as part of the factorization
37023660e330SKris Buschelman       */
3703eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
37043660e330SKris Buschelman         /* First Column */
37053660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
37063660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
37073660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
37083660e330SKris Buschelman 
37093660e330SKris Buschelman         /* Second Column */
37103660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
37113660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
37123660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
37133660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
37143660e330SKris Buschelman 
37153660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
37163660e330SKris Buschelman 
37173660e330SKris Buschelman         /* Third Column */
37183660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
37193660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
37203660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
37213660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
37223660e330SKris Buschelman 
37233660e330SKris Buschelman         /* Fourth Column */
37243660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
37253660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
37263660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
37273660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
37283660e330SKris Buschelman 
37293660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
37303660e330SKris Buschelman       SSE_INLINE_END_3
37313660e330SKris Buschelman 
37323660e330SKris Buschelman       v    = aa + ai16 + 16;
37333660e330SKris Buschelman       idt -= 4;
37343660e330SKris Buschelman     }
3735eb05f457SKris Buschelman 
3736eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3737eb05f457SKris Buschelman     idt = 4*(n-1);
3738eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3739eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3740eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3741eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3742eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3743eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3744eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3745eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3746eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
374754693613SKris Buschelman       idt -= 4;
37483660e330SKris Buschelman     }
3749eb05f457SKris Buschelman 
3750eb05f457SKris Buschelman   } /* End of artificial scope. */
37511ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
37521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3753dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37543660e330SKris Buschelman   SSE_SCOPE_END;
37553660e330SKris Buschelman   PetscFunctionReturn(0);
37563660e330SKris Buschelman }
37573660e330SKris Buschelman 
37587cf1b8d3SKris Buschelman #undef __FUNCT__
37597cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3760dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
37617cf1b8d3SKris Buschelman {
37627cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
37637cf1b8d3SKris Buschelman   int            *aj=a->j;
3764dfbe8321SBarry Smith   PetscErrorCode ierr;
3765dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
37667cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
37677cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
37687cf1b8d3SKris Buschelman 
37697cf1b8d3SKris Buschelman   PetscFunctionBegin;
37707cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
37717cf1b8d3SKris Buschelman   /*
37727cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
37737cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
37747cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
37757cf1b8d3SKris Buschelman   */
37767cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
37777cf1b8d3SKris Buschelman 
37781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
37791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
37807cf1b8d3SKris Buschelman   {
37817cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
37827cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
37837cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
37847cf1b8d3SKris Buschelman     int       jdx,idx;
37857cf1b8d3SKris Buschelman     int       *vi;
37867cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
37877cf1b8d3SKris Buschelman 
37887cf1b8d3SKris Buschelman     /* First block is the identity. */
37897cf1b8d3SKris Buschelman     idx  = 0;
37907cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
37917cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
37927cf1b8d3SKris Buschelman 
37937cf1b8d3SKris Buschelman     for (i=1; i<n;) {
37947cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
37957cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
37967cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
37977cf1b8d3SKris Buschelman       idx +=  4;
37987cf1b8d3SKris Buschelman 
37997cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
38007cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
38017cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
38027cf1b8d3SKris Buschelman 
38037cf1b8d3SKris Buschelman       while (nz--) {
38047cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
38057cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
38067cf1b8d3SKris Buschelman /*          jdx = *vi++; */
38077cf1b8d3SKris Buschelman 
38087cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
38097cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
38107cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
38117cf1b8d3SKris Buschelman 
38127cf1b8d3SKris Buschelman           /* First Column */
38137cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
38147cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
38157cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
38167cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
38177cf1b8d3SKris Buschelman 
38187cf1b8d3SKris Buschelman           /* Second Column */
38197cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
38207cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
38217cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
38227cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
38237cf1b8d3SKris Buschelman 
38247cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
38257cf1b8d3SKris Buschelman 
38267cf1b8d3SKris Buschelman           /* Third Column */
38277cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
38287cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
38297cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
38307cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
38317cf1b8d3SKris Buschelman 
38327cf1b8d3SKris Buschelman           /* Fourth Column */
38337cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
38347cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
38357cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
38367cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
38377cf1b8d3SKris Buschelman         SSE_INLINE_END_2
38387cf1b8d3SKris Buschelman 
38397cf1b8d3SKris Buschelman         v  += 16;
38407cf1b8d3SKris Buschelman       }
38417cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
38427cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
38437cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
38447cf1b8d3SKris Buschelman     }
38457cf1b8d3SKris Buschelman 
38467cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
38477cf1b8d3SKris Buschelman 
38487cf1b8d3SKris Buschelman     idt  = 4*(n-1);
38497cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
38507cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
38517cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
38527cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
38537cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
38547cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
38557cf1b8d3SKris Buschelman 
38567cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
38577cf1b8d3SKris Buschelman 
38587cf1b8d3SKris Buschelman       while (nz--) {
38597cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
38607cf1b8d3SKris Buschelman         idx = 4*(*vi++);
38617cf1b8d3SKris Buschelman /*          idx = *vi++; */
38627cf1b8d3SKris Buschelman 
38637cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
38647cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
38657cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
38667cf1b8d3SKris Buschelman 
38677cf1b8d3SKris Buschelman           /* First Column */
38687cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
38697cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
38707cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
38717cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
38727cf1b8d3SKris Buschelman 
38737cf1b8d3SKris Buschelman           /* Second Column */
38747cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
38757cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
38767cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
38777cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
38787cf1b8d3SKris Buschelman 
38797cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
38807cf1b8d3SKris Buschelman 
38817cf1b8d3SKris Buschelman           /* Third Column */
38827cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
38837cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
38847cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
38857cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
38867cf1b8d3SKris Buschelman 
38877cf1b8d3SKris Buschelman           /* Fourth Column */
38887cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
38897cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
38907cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
38917cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
38927cf1b8d3SKris Buschelman         SSE_INLINE_END_2
38937cf1b8d3SKris Buschelman         v  += 16;
38947cf1b8d3SKris Buschelman       }
38957cf1b8d3SKris Buschelman       v    = aa + ai16;
38967cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
38977cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
38987cf1b8d3SKris Buschelman       /*
38997cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
39007cf1b8d3SKris Buschelman          which was inverted as part of the factorization
39017cf1b8d3SKris Buschelman       */
39027cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
39037cf1b8d3SKris Buschelman         /* First Column */
39047cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
39057cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
39067cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
39077cf1b8d3SKris Buschelman 
39087cf1b8d3SKris Buschelman         /* Second Column */
39097cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
39107cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
39117cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
39127cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
39137cf1b8d3SKris Buschelman 
39147cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
39157cf1b8d3SKris Buschelman 
39167cf1b8d3SKris Buschelman         /* Third Column */
39177cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
39187cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
39197cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
39207cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
39217cf1b8d3SKris Buschelman 
39227cf1b8d3SKris Buschelman         /* Fourth Column */
39237cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
39247cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
39257cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
39267cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
39277cf1b8d3SKris Buschelman 
39287cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
39297cf1b8d3SKris Buschelman       SSE_INLINE_END_3
39307cf1b8d3SKris Buschelman 
39317cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
39327cf1b8d3SKris Buschelman       idt -= 4;
39337cf1b8d3SKris Buschelman     }
39347cf1b8d3SKris Buschelman 
39357cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
39367cf1b8d3SKris Buschelman     idt = 4*(n-1);
39377cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
39387cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
39397cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
39407cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
39417cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
39427cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
39437cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
39447cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
39457cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
39467cf1b8d3SKris Buschelman       idt -= 4;
39477cf1b8d3SKris Buschelman     }
39487cf1b8d3SKris Buschelman 
39497cf1b8d3SKris Buschelman   } /* End of artificial scope. */
39501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
39511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3952dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
39537cf1b8d3SKris Buschelman   SSE_SCOPE_END;
39547cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
39557cf1b8d3SKris Buschelman }
39567cf1b8d3SKris Buschelman 
39573660e330SKris Buschelman #endif
39588f690400SShri Abhyankar 
39594a2ae208SSatish Balay #undef __FUNCT__
39604a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
39624e2b4712SSatish Balay {
39634e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
39644e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
39656849ba73SBarry Smith   PetscErrorCode    ierr;
39665d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
39675d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3968d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3969d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3970d9fead3dSBarry Smith   const PetscScalar *b;
39714e2b4712SSatish Balay 
39724e2b4712SSatish Balay   PetscFunctionBegin;
3973d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3975f1af5d2fSBarry Smith   t  = a->solve_work;
39764e2b4712SSatish Balay 
39774e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
39784e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
39794e2b4712SSatish Balay 
39804e2b4712SSatish Balay   /* forward solve the lower triangular */
39814e2b4712SSatish Balay   idx    = 3*(*r++);
3982f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
39834e2b4712SSatish Balay   for (i=1; i<n; i++) {
39844e2b4712SSatish Balay     v     = aa + 9*ai[i];
39854e2b4712SSatish Balay     vi    = aj + ai[i];
39864e2b4712SSatish Balay     nz    = diag[i] - ai[i];
39874e2b4712SSatish Balay     idx   = 3*(*r++);
3988f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
39894e2b4712SSatish Balay     while (nz--) {
39904e2b4712SSatish Balay       idx   = 3*(*vi++);
3991f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3992f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3993f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3994f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39954e2b4712SSatish Balay       v += 9;
39964e2b4712SSatish Balay     }
39974e2b4712SSatish Balay     idx = 3*i;
3998f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
39994e2b4712SSatish Balay   }
40004e2b4712SSatish Balay   /* backward solve the upper triangular */
40014e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
40024e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
40034e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
40044e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
40054e2b4712SSatish Balay     idt  = 3*i;
4006f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
40074e2b4712SSatish Balay     while (nz--) {
40084e2b4712SSatish Balay       idx   = 3*(*vi++);
4009f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4010f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4011f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4012f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40134e2b4712SSatish Balay       v += 9;
40144e2b4712SSatish Balay     }
40154e2b4712SSatish Balay     idc = 3*(*c--);
40164e2b4712SSatish Balay     v   = aa + 9*diag[i];
4017f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4018f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4019f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
40204e2b4712SSatish Balay   }
40214e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
40224e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4023d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40241ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4025dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
40264e2b4712SSatish Balay   PetscFunctionReturn(0);
40274e2b4712SSatish Balay }
40284e2b4712SSatish Balay 
40298f690400SShri Abhyankar #undef __FUNCT__
40308f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
40318f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
40328f690400SShri Abhyankar {
40338f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
40348f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
40358f690400SShri Abhyankar   PetscErrorCode    ierr;
403629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
40378f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
40388f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
40398f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
40408f690400SShri Abhyankar   const PetscScalar *b;
40418f690400SShri Abhyankar 
40428f690400SShri Abhyankar   PetscFunctionBegin;
40438f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40448f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
40458f690400SShri Abhyankar   t  = a->solve_work;
40468f690400SShri Abhyankar 
40478f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
404829b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
40498f690400SShri Abhyankar 
40508f690400SShri Abhyankar   /* forward solve the lower triangular */
405129b92fc1SShri Abhyankar   idx    = 3*r[0];
40528f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
40538f690400SShri Abhyankar   for (i=1; i<n; i++) {
40548f690400SShri Abhyankar     v     = aa + 9*ai[i];
40558f690400SShri Abhyankar     vi    = aj + ai[i];
40568f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
405729b92fc1SShri Abhyankar     idx   = 3*r[i];
40588f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
405929b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
406029b92fc1SShri Abhyankar       idx   = 3*vi[m];
40618f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40628f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40638f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40648f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40658f690400SShri Abhyankar       v += 9;
40668f690400SShri Abhyankar     }
40678f690400SShri Abhyankar     idx = 3*i;
40688f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
40698f690400SShri Abhyankar   }
40708f690400SShri Abhyankar   /* backward solve the upper triangular */
40718f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
40728f690400SShri Abhyankar     k    = 2*n-i;
40738f690400SShri Abhyankar     v    = aa + 9*ai[k];
40748f690400SShri Abhyankar     vi   = aj + ai[k];
40758f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
40768f690400SShri Abhyankar     idt  = 3*i;
40778f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
407829b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
407929b92fc1SShri Abhyankar       idx   = 3*vi[m];
40808f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40818f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40828f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40838f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40848f690400SShri Abhyankar       v += 9;
40858f690400SShri Abhyankar     }
408629b92fc1SShri Abhyankar     idc = 3*c[i];
40878f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
40888f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
40898f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
40908f690400SShri Abhyankar   }
40918f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
40928f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40938f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40948f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
40958f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
40968f690400SShri Abhyankar   PetscFunctionReturn(0);
40978f690400SShri Abhyankar }
40988f690400SShri Abhyankar 
40990c4413a7SShri Abhyankar #undef __FUNCT__
41000c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
41010c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
41020c4413a7SShri Abhyankar {
41030c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
41040c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
41050c4413a7SShri Abhyankar   PetscErrorCode    ierr;
41060c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
41070c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
41080c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
41090c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
41100c4413a7SShri Abhyankar   const PetscScalar *b;
41110c4413a7SShri Abhyankar 
41120c4413a7SShri Abhyankar   PetscFunctionBegin;
41130c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41140c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41150c4413a7SShri Abhyankar   t  = a->solve_work;
41160c4413a7SShri Abhyankar 
41170c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
41180c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
41190c4413a7SShri Abhyankar 
41200c4413a7SShri Abhyankar   /* forward solve the lower triangular */
41210c4413a7SShri Abhyankar   idx    = 3*r[0];
41220c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
41230c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
41240c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
41250c4413a7SShri Abhyankar     vi    = aj + ai[i];
41260c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
41270c4413a7SShri Abhyankar     idx   = 3*r[i];
41280c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
41290c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
41300c4413a7SShri Abhyankar       idx   = 3*vi[m];
41310c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
41320c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
41330c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
41340c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41350c4413a7SShri Abhyankar       v += 9;
41360c4413a7SShri Abhyankar     }
41370c4413a7SShri Abhyankar     idx = 3*i;
41380c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
41390c4413a7SShri Abhyankar   }
41400c4413a7SShri Abhyankar   /* backward solve the upper triangular */
41410c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
41420c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
41430c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
41440c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
41450c4413a7SShri Abhyankar     idt  = 3*i;
41460c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
41470c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
41480c4413a7SShri Abhyankar       idx   = 3*vi[m];
41490c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
41500c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
41510c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
41520c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41530c4413a7SShri Abhyankar       v += 9;
41540c4413a7SShri Abhyankar     }
41550c4413a7SShri Abhyankar     idc = 3*c[i];
41560c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
41570c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
41580c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
41590c4413a7SShri Abhyankar   }
41600c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
41610c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
41620c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41630c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
41640c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
41650c4413a7SShri Abhyankar   PetscFunctionReturn(0);
41660c4413a7SShri Abhyankar }
41670c4413a7SShri Abhyankar 
416815091d37SBarry Smith /*
416915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
417015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
417115091d37SBarry Smith */
41724a2ae208SSatish Balay #undef __FUNCT__
41734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4174dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
417515091d37SBarry Smith {
417615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4177690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4178dfbe8321SBarry Smith   PetscErrorCode    ierr;
4179690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4180d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4181d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4182d9fead3dSBarry Smith   const PetscScalar *b;
4183690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
418415091d37SBarry Smith 
418515091d37SBarry Smith   PetscFunctionBegin;
4186d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
418815091d37SBarry Smith 
418915091d37SBarry Smith   /* forward solve the lower triangular */
419015091d37SBarry Smith   idx    = 0;
419115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
419215091d37SBarry Smith   for (i=1; i<n; i++) {
419315091d37SBarry Smith     v     =  aa      + 9*ai[i];
419415091d37SBarry Smith     vi    =  aj      + ai[i];
419515091d37SBarry Smith     nz    =  diag[i] - ai[i];
419615091d37SBarry Smith     idx   +=  3;
4197f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
419815091d37SBarry Smith     while (nz--) {
419915091d37SBarry Smith       jdx   = 3*(*vi++);
420015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4201f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4202f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4203f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
420415091d37SBarry Smith       v    += 9;
420515091d37SBarry Smith     }
4206f1af5d2fSBarry Smith     x[idx]   = s1;
4207f1af5d2fSBarry Smith     x[1+idx] = s2;
4208f1af5d2fSBarry Smith     x[2+idx] = s3;
420915091d37SBarry Smith   }
421015091d37SBarry Smith   /* backward solve the upper triangular */
421115091d37SBarry Smith   for (i=n-1; i>=0; i--){
421215091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
421315091d37SBarry Smith     vi   = aj + diag[i] + 1;
421415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
421515091d37SBarry Smith     idt  = 3*i;
4216f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4217f1af5d2fSBarry Smith     s3 = x[2+idt];
421815091d37SBarry Smith     while (nz--) {
421915091d37SBarry Smith       idx   = 3*(*vi++);
422015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4221f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4222f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4223f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
422415091d37SBarry Smith       v    += 9;
422515091d37SBarry Smith     }
422615091d37SBarry Smith     v        = aa +  9*diag[i];
4227f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4228f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4229f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
423015091d37SBarry Smith   }
423115091d37SBarry Smith 
4232d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
423515091d37SBarry Smith   PetscFunctionReturn(0);
423615091d37SBarry Smith }
423715091d37SBarry Smith 
42384a2ae208SSatish Balay #undef __FUNCT__
4239cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4240cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4241cee9d6f2SShri Abhyankar {
4242cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4243ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4244cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4245cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
4246cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4247cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4248cee9d6f2SShri Abhyankar     PetscScalar       *x;
4249cee9d6f2SShri Abhyankar     const PetscScalar *b;
4250cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4251cee9d6f2SShri Abhyankar 
4252cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4253cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4254cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4255cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4256cee9d6f2SShri Abhyankar     idx    = 0;
4257cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4258cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4259cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
4260cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4261cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4262cee9d6f2SShri Abhyankar       idx   = bs*i;
4263cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4264ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4265ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
4266cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4267cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4268cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4269cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4270cee9d6f2SShri Abhyankar 
4271cee9d6f2SShri Abhyankar           v   +=  bs2;
4272cee9d6f2SShri Abhyankar         }
4273cee9d6f2SShri Abhyankar 
4274cee9d6f2SShri Abhyankar        x[idx]   = s1;
4275cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4276cee9d6f2SShri Abhyankar        x[2+idx] = s3;
4277cee9d6f2SShri Abhyankar     }
4278cee9d6f2SShri Abhyankar 
4279cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4280cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4281cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
4282cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4283cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4284cee9d6f2SShri Abhyankar      idt = bs*i;
4285cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4286cee9d6f2SShri Abhyankar 
4287ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4288ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
4289cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4290cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4291cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4292cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4293cee9d6f2SShri Abhyankar 
4294cee9d6f2SShri Abhyankar         v   +=  bs2;
4295cee9d6f2SShri Abhyankar     }
4296cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4297cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4298cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4299cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4300cee9d6f2SShri Abhyankar 
4301cee9d6f2SShri Abhyankar   }
4302cee9d6f2SShri Abhyankar 
4303cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4304cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4305cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4306cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4307cee9d6f2SShri Abhyankar }
4308cee9d6f2SShri Abhyankar 
4309cee9d6f2SShri Abhyankar #undef __FUNCT__
4310b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4311b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4312b2b2dd24SShri Abhyankar {
4313b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4314b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4315b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4316b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4317b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4318b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4319b2b2dd24SShri Abhyankar     PetscScalar       *x;
4320b2b2dd24SShri Abhyankar     const PetscScalar *b;
4321b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4322b2b2dd24SShri Abhyankar 
4323b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4324b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4325b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4326b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4327b2b2dd24SShri Abhyankar     idx    = 0;
4328b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4329b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4330b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4331b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4332b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4333b2b2dd24SShri Abhyankar       idx   = bs*i;
4334b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4335b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4336b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4337b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4338b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4339b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4340b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4341b2b2dd24SShri Abhyankar 
4342b2b2dd24SShri Abhyankar           v   +=  bs2;
4343b2b2dd24SShri Abhyankar         }
4344b2b2dd24SShri Abhyankar 
4345b2b2dd24SShri Abhyankar        x[idx]   = s1;
4346b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4347b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4348b2b2dd24SShri Abhyankar     }
4349b2b2dd24SShri Abhyankar 
4350b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4351b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4352b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4353b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4354b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4355b2b2dd24SShri Abhyankar      idt = bs*i;
4356b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4357b2b2dd24SShri Abhyankar 
4358b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4359b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4360b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4361b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4362b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4363b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4364b2b2dd24SShri Abhyankar 
4365b2b2dd24SShri Abhyankar         v   +=  bs2;
4366b2b2dd24SShri Abhyankar     }
4367b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4368b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4369b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4370b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4371b2b2dd24SShri Abhyankar 
4372b2b2dd24SShri Abhyankar   }
4373b2b2dd24SShri Abhyankar 
4374b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4375b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4376b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4377b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4378b2b2dd24SShri Abhyankar }
4379b2b2dd24SShri Abhyankar 
4380b2b2dd24SShri Abhyankar #undef __FUNCT__
43814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
43834e2b4712SSatish Balay {
43844e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
43854e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
43866849ba73SBarry Smith   PetscErrorCode    ierr;
43875d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
43885d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4389d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4390d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4391d9fead3dSBarry Smith   const PetscScalar *b;
43924e2b4712SSatish Balay 
43934e2b4712SSatish Balay   PetscFunctionBegin;
4394d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43951ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4396f1af5d2fSBarry Smith   t  = a->solve_work;
43974e2b4712SSatish Balay 
43984e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
43994e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
44004e2b4712SSatish Balay 
44014e2b4712SSatish Balay   /* forward solve the lower triangular */
44024e2b4712SSatish Balay   idx    = 2*(*r++);
4403f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
44044e2b4712SSatish Balay   for (i=1; i<n; i++) {
44054e2b4712SSatish Balay     v     = aa + 4*ai[i];
44064e2b4712SSatish Balay     vi    = aj + ai[i];
44074e2b4712SSatish Balay     nz    = diag[i] - ai[i];
44084e2b4712SSatish Balay     idx   = 2*(*r++);
4409f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
44104e2b4712SSatish Balay     while (nz--) {
44114e2b4712SSatish Balay       idx   = 2*(*vi++);
4412f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4413f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4414f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
44154e2b4712SSatish Balay       v += 4;
44164e2b4712SSatish Balay     }
44174e2b4712SSatish Balay     idx = 2*i;
4418f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
44194e2b4712SSatish Balay   }
44204e2b4712SSatish Balay   /* backward solve the upper triangular */
44214e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
44224e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
44234e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
44244e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
44254e2b4712SSatish Balay     idt  = 2*i;
4426f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
44274e2b4712SSatish Balay     while (nz--) {
44284e2b4712SSatish Balay       idx   = 2*(*vi++);
4429f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4430f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4431f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
44324e2b4712SSatish Balay       v += 4;
44334e2b4712SSatish Balay     }
44344e2b4712SSatish Balay     idc = 2*(*c--);
44354e2b4712SSatish Balay     v   = aa + 4*diag[i];
4436f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4437f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
44384e2b4712SSatish Balay   }
44394e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
44404e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4441d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44421ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4443dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
44444e2b4712SSatish Balay   PetscFunctionReturn(0);
44454e2b4712SSatish Balay }
44464e2b4712SSatish Balay 
44478f690400SShri Abhyankar #undef __FUNCT__
44488f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
44498f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
44508f690400SShri Abhyankar {
44518f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
44528f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
44538f690400SShri Abhyankar   PetscErrorCode    ierr;
445429b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
44558f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
44568f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
44578f690400SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
44588f690400SShri Abhyankar   const PetscScalar *b;
44598f690400SShri Abhyankar 
44608f690400SShri Abhyankar   PetscFunctionBegin;
44618f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44628f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
44638f690400SShri Abhyankar   t  = a->solve_work;
44648f690400SShri Abhyankar 
44658f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
446629b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
44678f690400SShri Abhyankar 
44688f690400SShri Abhyankar   /* forward solve the lower triangular */
446929b92fc1SShri Abhyankar   idx    = 2*r[0];
44708f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
44718f690400SShri Abhyankar   for (i=1; i<n; i++) {
44728f690400SShri Abhyankar     v     = aa + 4*ai[i];
44738f690400SShri Abhyankar     vi    = aj + ai[i];
44748f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
447529b92fc1SShri Abhyankar     idx   = 2*r[i];
44768f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
447729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
447829b92fc1SShri Abhyankar       jdx   = 2*vi[m];
44798f690400SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
44808f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
44818f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
44828f690400SShri Abhyankar       v += 4;
44838f690400SShri Abhyankar     }
44848f690400SShri Abhyankar     idx = 2*i;
44858f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
44868f690400SShri Abhyankar   }
44878f690400SShri Abhyankar   /* backward solve the upper triangular */
44888f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
44898f690400SShri Abhyankar     k = 2*n-i;
44908f690400SShri Abhyankar     v    = aa + 4*ai[k];
44918f690400SShri Abhyankar     vi   = aj + ai[k];
44928f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
44938f690400SShri Abhyankar     idt  = 2*i;
44948f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
449529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
449629b92fc1SShri Abhyankar       idx   = 2*vi[m];
44978f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
44988f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
44998f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
45008f690400SShri Abhyankar       v += 4;
45018f690400SShri Abhyankar     }
450229b92fc1SShri Abhyankar     idc = 2*c[i];
45038f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
45048f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
45058f690400SShri Abhyankar   }
45068f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45078f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45088f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45098f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
45108f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
45118f690400SShri Abhyankar   PetscFunctionReturn(0);
45128f690400SShri Abhyankar }
45138f690400SShri Abhyankar 
45140c4413a7SShri Abhyankar #undef __FUNCT__
45150c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
45160c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
45170c4413a7SShri Abhyankar {
45180c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
45190c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
45200c4413a7SShri Abhyankar   PetscErrorCode    ierr;
45210c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
45220c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
45230c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
45240c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
45250c4413a7SShri Abhyankar   const PetscScalar *b;
45260c4413a7SShri Abhyankar 
45270c4413a7SShri Abhyankar   PetscFunctionBegin;
45280c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45290c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45300c4413a7SShri Abhyankar   t  = a->solve_work;
45310c4413a7SShri Abhyankar 
45320c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
45330c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
45340c4413a7SShri Abhyankar 
45350c4413a7SShri Abhyankar   /* forward solve the lower triangular */
45360c4413a7SShri Abhyankar   idx    = 2*r[0];
45370c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
45380c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
45390c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
45400c4413a7SShri Abhyankar     vi    = aj + ai[i];
45410c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
45420c4413a7SShri Abhyankar     idx   = 2*r[i];
45430c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
45440c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
45450c4413a7SShri Abhyankar       jdx   = 2*vi[m];
45460c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
45470c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
45480c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
45490c4413a7SShri Abhyankar       v += 4;
45500c4413a7SShri Abhyankar     }
45510c4413a7SShri Abhyankar     idx = 2*i;
45520c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
45530c4413a7SShri Abhyankar   }
45540c4413a7SShri Abhyankar   /* backward solve the upper triangular */
45550c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
45560c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
45570c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
45580c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
45590c4413a7SShri Abhyankar     idt  = 2*i;
45600c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
45610c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
45620c4413a7SShri Abhyankar       idx   = 2*vi[m];
45630c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
45640c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
45650c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
45660c4413a7SShri Abhyankar       v += 4;
45670c4413a7SShri Abhyankar     }
45680c4413a7SShri Abhyankar     idc = 2*c[i];
45690c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
45700c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
45710c4413a7SShri Abhyankar   }
45720c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45730c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45740c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45750c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
45760c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
45770c4413a7SShri Abhyankar   PetscFunctionReturn(0);
45780c4413a7SShri Abhyankar }
45798f690400SShri Abhyankar 
458015091d37SBarry Smith /*
458115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
458215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
458315091d37SBarry Smith */
45844a2ae208SSatish Balay #undef __FUNCT__
45854a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4586dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
458715091d37SBarry Smith {
458815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4589690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4590dfbe8321SBarry Smith   PetscErrorCode    ierr;
4591690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4592d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4593d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4594d9fead3dSBarry Smith   const PetscScalar *b;
4595690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
459615091d37SBarry Smith 
459715091d37SBarry Smith   PetscFunctionBegin;
4598d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
460015091d37SBarry Smith 
460115091d37SBarry Smith   /* forward solve the lower triangular */
460215091d37SBarry Smith   idx    = 0;
460315091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
460415091d37SBarry Smith   for (i=1; i<n; i++) {
460515091d37SBarry Smith     v     =  aa      + 4*ai[i];
460615091d37SBarry Smith     vi    =  aj      + ai[i];
460715091d37SBarry Smith     nz    =  diag[i] - ai[i];
460815091d37SBarry Smith     idx   +=  2;
4609f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
461015091d37SBarry Smith     while (nz--) {
461115091d37SBarry Smith       jdx   = 2*(*vi++);
461215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4613f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4614f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
461515091d37SBarry Smith       v    += 4;
461615091d37SBarry Smith     }
4617f1af5d2fSBarry Smith     x[idx]   = s1;
4618f1af5d2fSBarry Smith     x[1+idx] = s2;
461915091d37SBarry Smith   }
462015091d37SBarry Smith   /* backward solve the upper triangular */
462115091d37SBarry Smith   for (i=n-1; i>=0; i--){
462215091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
462315091d37SBarry Smith     vi   = aj + diag[i] + 1;
462415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
462515091d37SBarry Smith     idt  = 2*i;
4626f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
462715091d37SBarry Smith     while (nz--) {
462815091d37SBarry Smith       idx   = 2*(*vi++);
462915091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4630f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4631f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
463215091d37SBarry Smith       v    += 4;
463315091d37SBarry Smith     }
463415091d37SBarry Smith     v        = aa +  4*diag[i];
4635f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4636f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
463715091d37SBarry Smith   }
463815091d37SBarry Smith 
4639d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4641dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
464215091d37SBarry Smith   PetscFunctionReturn(0);
464315091d37SBarry Smith }
464415091d37SBarry Smith 
46454a2ae208SSatish Balay #undef __FUNCT__
4646cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4647cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4648cee9d6f2SShri Abhyankar {
4649cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4650ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4651cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4652cee9d6f2SShri Abhyankar     PetscInt          jdx;
4653cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4654cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4655cee9d6f2SShri Abhyankar     const PetscScalar *b;
4656cee9d6f2SShri Abhyankar 
4657cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4658cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4659cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4660cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4661cee9d6f2SShri Abhyankar     idx    = 0;
4662cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4663cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4664cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
4665cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4666cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4667cee9d6f2SShri Abhyankar        idx  = 2*i;
4668cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4669ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4670ce3d78c0SShri Abhyankar          jdx   = 2*vi[k];
4671cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4672cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4673cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4674cee9d6f2SShri Abhyankar            v   +=  4;
4675cee9d6f2SShri Abhyankar         }
4676cee9d6f2SShri Abhyankar        x[idx]   = s1;
4677cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4678cee9d6f2SShri Abhyankar     }
4679cee9d6f2SShri Abhyankar 
4680cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4681cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4682cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
4683cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4684cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4685cee9d6f2SShri Abhyankar      idt = 2*i;
4686cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4687ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4688ce3d78c0SShri Abhyankar       idx   = 2*vi[k];
4689cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4690cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4691cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4692cee9d6f2SShri Abhyankar          v    += 4;
4693cee9d6f2SShri Abhyankar     }
4694cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4695cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4696cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4697cee9d6f2SShri Abhyankar   }
4698cee9d6f2SShri Abhyankar 
4699cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4700cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4701cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4702cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4703cee9d6f2SShri Abhyankar }
4704cee9d6f2SShri Abhyankar 
4705cee9d6f2SShri Abhyankar #undef __FUNCT__
4706b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4707b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4708b2b2dd24SShri Abhyankar {
4709b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4710b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4711b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4712b2b2dd24SShri Abhyankar     PetscInt          jdx;
4713b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4714b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4715b2b2dd24SShri Abhyankar     const PetscScalar *b;
4716b2b2dd24SShri Abhyankar 
4717b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4718b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4719b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4720b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4721b2b2dd24SShri Abhyankar     idx    = 0;
4722b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4723b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4724b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4725b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4726b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4727b2b2dd24SShri Abhyankar        idx  = 2*i;
4728b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4729b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4730b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4731b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4732b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4733b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4734b2b2dd24SShri Abhyankar            v   +=  4;
4735b2b2dd24SShri Abhyankar         }
4736b2b2dd24SShri Abhyankar        x[idx]   = s1;
4737b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4738b2b2dd24SShri Abhyankar     }
4739b2b2dd24SShri Abhyankar 
4740b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4741b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4742b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4743b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4744b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4745b2b2dd24SShri Abhyankar      idt = 2*i;
4746b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4747b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4748b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4749b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4750b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4751b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4752b2b2dd24SShri Abhyankar          v    += 4;
4753b2b2dd24SShri Abhyankar     }
4754b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4755b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4756b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4757b2b2dd24SShri Abhyankar   }
4758b2b2dd24SShri Abhyankar 
4759b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4760b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4761b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4762b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4763b2b2dd24SShri Abhyankar }
4764b2b2dd24SShri Abhyankar 
4765b2b2dd24SShri Abhyankar #undef __FUNCT__
47664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4767dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
47684e2b4712SSatish Balay {
47694e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
47704e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
47716849ba73SBarry Smith   PetscErrorCode ierr;
47725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
47735d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
47743f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
477587828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
47764e2b4712SSatish Balay 
47774e2b4712SSatish Balay   PetscFunctionBegin;
47784e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
47794e2b4712SSatish Balay 
47801ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
47811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4782f1af5d2fSBarry Smith   t  = a->solve_work;
47834e2b4712SSatish Balay 
47844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47864e2b4712SSatish Balay 
47874e2b4712SSatish Balay   /* forward solve the lower triangular */
4788f1af5d2fSBarry Smith   t[0] = b[*r++];
47894e2b4712SSatish Balay   for (i=1; i<n; i++) {
47904e2b4712SSatish Balay     v     = aa + ai[i];
47914e2b4712SSatish Balay     vi    = aj + ai[i];
47924e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4793f1af5d2fSBarry Smith     s1  = b[*r++];
47944e2b4712SSatish Balay     while (nz--) {
4795f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
47964e2b4712SSatish Balay     }
4797f1af5d2fSBarry Smith     t[i] = s1;
47984e2b4712SSatish Balay   }
47994e2b4712SSatish Balay   /* backward solve the upper triangular */
48004e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
48014e2b4712SSatish Balay     v    = aa + diag[i] + 1;
48024e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
48034e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4804f1af5d2fSBarry Smith     s1 = t[i];
48054e2b4712SSatish Balay     while (nz--) {
4806f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
48074e2b4712SSatish Balay     }
4808f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
48094e2b4712SSatish Balay   }
48104e2b4712SSatish Balay 
48114e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48124e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48131ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
48141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4815dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
48164e2b4712SSatish Balay   PetscFunctionReturn(0);
48174e2b4712SSatish Balay }
481815091d37SBarry Smith /*
481915091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
482015091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
482115091d37SBarry Smith */
48224a2ae208SSatish Balay #undef __FUNCT__
48234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4824dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
482515091d37SBarry Smith {
482615091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4827690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4828dfbe8321SBarry Smith   PetscErrorCode ierr;
4829690b6cddSBarry Smith   PetscInt       *diag = a->diag;
483015091d37SBarry Smith   MatScalar      *aa=a->a;
483187828ca2SBarry Smith   PetscScalar    *x,*b;
483287828ca2SBarry Smith   PetscScalar    s1,x1;
483315091d37SBarry Smith   MatScalar      *v;
4834690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
483515091d37SBarry Smith 
483615091d37SBarry Smith   PetscFunctionBegin;
48371ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
48381ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
483915091d37SBarry Smith 
484015091d37SBarry Smith   /* forward solve the lower triangular */
484115091d37SBarry Smith   idx    = 0;
484215091d37SBarry Smith   x[0]   = b[0];
484315091d37SBarry Smith   for (i=1; i<n; i++) {
484415091d37SBarry Smith     v     =  aa      + ai[i];
484515091d37SBarry Smith     vi    =  aj      + ai[i];
484615091d37SBarry Smith     nz    =  diag[i] - ai[i];
484715091d37SBarry Smith     idx   +=  1;
4848f1af5d2fSBarry Smith     s1  =  b[idx];
484915091d37SBarry Smith     while (nz--) {
485015091d37SBarry Smith       jdx   = *vi++;
485115091d37SBarry Smith       x1    = x[jdx];
4852f1af5d2fSBarry Smith       s1 -= v[0]*x1;
485315091d37SBarry Smith       v    += 1;
485415091d37SBarry Smith     }
4855f1af5d2fSBarry Smith     x[idx]   = s1;
485615091d37SBarry Smith   }
485715091d37SBarry Smith   /* backward solve the upper triangular */
485815091d37SBarry Smith   for (i=n-1; i>=0; i--){
485915091d37SBarry Smith     v    = aa + diag[i] + 1;
486015091d37SBarry Smith     vi   = aj + diag[i] + 1;
486115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
486215091d37SBarry Smith     idt  = i;
4863f1af5d2fSBarry Smith     s1 = x[idt];
486415091d37SBarry Smith     while (nz--) {
486515091d37SBarry Smith       idx   = *vi++;
486615091d37SBarry Smith       x1    = x[idx];
4867f1af5d2fSBarry Smith       s1 -= v[0]*x1;
486815091d37SBarry Smith       v    += 1;
486915091d37SBarry Smith     }
487015091d37SBarry Smith     v        = aa +  diag[i];
4871f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
487215091d37SBarry Smith   }
48731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
48741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4875dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
487615091d37SBarry Smith   PetscFunctionReturn(0);
487715091d37SBarry Smith }
48784e2b4712SSatish Balay 
48794e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/*
   Prototypes for routines that are not defined in this part of the file.
   MatDuplicateNoCreate_SeqBAIJ() is called by
   MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct() below.
*/
488016a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
48816bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
48826bce7ff8SHong Zhang 
48836bce7ff8SHong Zhang #undef __FUNCT__
48846bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
48856bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
48866bce7ff8SHong Zhang {
48876bce7ff8SHong Zhang   Mat            C=B;
48886bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
48896bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
48906bce7ff8SHong Zhang   PetscErrorCode ierr;
48916bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
48926bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
48936bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4894b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4895914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4896914a18a2SHong Zhang   MatScalar      *v_work;
48976bce7ff8SHong Zhang 
48986bce7ff8SHong Zhang   PetscFunctionBegin;
48996bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
49006bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4901914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4902914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
49036bce7ff8SHong Zhang   ics  = ic;
49046bce7ff8SHong Zhang 
4905914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
4906914a18a2SHong Zhang   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4907b588c5a2SHong Zhang   mwork    = v_work + bs;
4908b588c5a2SHong Zhang   v_pivots = (PetscInt*)(mwork + bs2);
4909914a18a2SHong Zhang 
49106bce7ff8SHong Zhang   for (i=0; i<n; i++){
49116bce7ff8SHong Zhang     /* zero rtmp */
49126bce7ff8SHong Zhang     /* L part */
49136bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
49146bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4915914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4916914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4917914a18a2SHong Zhang     }
49186bce7ff8SHong Zhang 
49196bce7ff8SHong Zhang     /* U part */
49206bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
49216bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
4922914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4923914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4924914a18a2SHong Zhang     }
49256bce7ff8SHong Zhang 
49266bce7ff8SHong Zhang     /* load in initial (unfactored row) */
49276bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
49286bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
4929914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
49306bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4931914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
49326bce7ff8SHong Zhang     }
49336bce7ff8SHong Zhang 
49346bce7ff8SHong Zhang     /* elimination */
49356bce7ff8SHong Zhang     bjtmp = bj + bi[i];
49366bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
4937b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
4938b1646270SShri Abhyankar       row = bjtmp[k];
4939914a18a2SHong Zhang       pc = rtmp + bs2*row;
4940914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4941914a18a2SHong Zhang       if (flg) {
4942914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
4943b588c5a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
49446bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4945914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
49466bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4947914a18a2SHong Zhang         for (j=0; j<nz; j++) {
4948914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4949914a18a2SHong Zhang         }
4950b588c5a2SHong Zhang         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
49516bce7ff8SHong Zhang       }
49526bce7ff8SHong Zhang     }
49536bce7ff8SHong Zhang 
49546bce7ff8SHong Zhang     /* finished row so stick it into b->a */
49556bce7ff8SHong Zhang     /* L part */
4956914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
49576bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
49586bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
49596bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4960914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49616bce7ff8SHong Zhang     }
49626bce7ff8SHong Zhang 
49636bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
4964914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
49656bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
4966914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4967914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4968914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
49696bce7ff8SHong Zhang 
49706bce7ff8SHong Zhang     /* U part */
4971914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
49726bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
49736bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4974914a18a2SHong Zhang     for (j=0; j<nz; j++){
4975914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4976914a18a2SHong Zhang     }
49776bce7ff8SHong Zhang   }
49786bce7ff8SHong Zhang 
49796bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
49806bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
49816bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
49826bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
498327019359SHong Zhang 
49846bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
4985914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
49866bce7ff8SHong Zhang   PetscFunctionReturn(0);
49876bce7ff8SHong Zhang }
49886bce7ff8SHong Zhang 
49891a83e813SShri Abhyankar #undef __FUNCT__
49901a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
49911a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
49921a83e813SShri Abhyankar {
49931a83e813SShri Abhyankar   Mat            C=B;
49941a83e813SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
49951a83e813SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
49961a83e813SShri Abhyankar   PetscErrorCode ierr;
49971a83e813SShri Abhyankar   const PetscInt *r,*ic,*ics;
49981a83e813SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
49991a83e813SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
50001a83e813SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
50011a83e813SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
50021a83e813SShri Abhyankar   MatScalar      *v_work;
50031a83e813SShri Abhyankar 
50041a83e813SShri Abhyankar   PetscFunctionBegin;
50051a83e813SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
50061a83e813SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
50071a83e813SShri Abhyankar   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
50081a83e813SShri Abhyankar   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
50091a83e813SShri Abhyankar   ics  = ic;
50101a83e813SShri Abhyankar 
50111a83e813SShri Abhyankar   /* generate work space needed by dense LU factorization */
50121a83e813SShri Abhyankar   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
50131a83e813SShri Abhyankar   mwork    = v_work + bs;
50141a83e813SShri Abhyankar   v_pivots = (PetscInt*)(mwork + bs2);
50151a83e813SShri Abhyankar 
50161a83e813SShri Abhyankar   for (i=0; i<n; i++){
50171a83e813SShri Abhyankar     /* zero rtmp */
50181a83e813SShri Abhyankar     /* L part */
50191a83e813SShri Abhyankar     nz    = bi[i+1] - bi[i];
50201a83e813SShri Abhyankar     bjtmp = bj + bi[i];
50211a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
50221a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50231a83e813SShri Abhyankar     }
50241a83e813SShri Abhyankar 
50251a83e813SShri Abhyankar     /* U part */
50261a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
50271a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
50281a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
50291a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50301a83e813SShri Abhyankar     }
50311a83e813SShri Abhyankar 
50321a83e813SShri Abhyankar     /* load in initial (unfactored row) */
50331a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
50341a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
50351a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
50361a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
50371a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
50381a83e813SShri Abhyankar     }
50391a83e813SShri Abhyankar 
50401a83e813SShri Abhyankar     /* elimination */
50411a83e813SShri Abhyankar     bjtmp = bj + bi[i];
50421a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
50431a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
50441a83e813SShri Abhyankar       row = bjtmp[k];
50451a83e813SShri Abhyankar       pc = rtmp + bs2*row;
50461a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
50471a83e813SShri Abhyankar       if (flg) {
50481a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
50491a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
50501a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
50511a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
50521a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
50531a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
50541a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
50551a83e813SShri Abhyankar         }
50561a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
50571a83e813SShri Abhyankar       }
50581a83e813SShri Abhyankar     }
50591a83e813SShri Abhyankar 
50601a83e813SShri Abhyankar     /* finished row so stick it into b->a */
50611a83e813SShri Abhyankar     /* L part */
50621a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
50631a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
50641a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
50651a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
50661a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50671a83e813SShri Abhyankar     }
50681a83e813SShri Abhyankar 
50691a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
50701a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
50711a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
50721a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
50731a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50741a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
50751a83e813SShri Abhyankar 
50761a83e813SShri Abhyankar     /* U part */
50771a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
50781a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
50791a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
50801a83e813SShri Abhyankar     for (j=0; j<nz; j++){
50811a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50821a83e813SShri Abhyankar     }
50831a83e813SShri Abhyankar   }
50841a83e813SShri Abhyankar 
50851a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
50861a83e813SShri Abhyankar   ierr = PetscFree(v_work);CHKERRQ(ierr);
50871a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
50881a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
50891a83e813SShri Abhyankar 
50901a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
50911a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
50921a83e813SShri Abhyankar   PetscFunctionReturn(0);
50931a83e813SShri Abhyankar }
50941a83e813SShri Abhyankar 
50956bce7ff8SHong Zhang /*
50966bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
509716a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
509816a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
50996bce7ff8SHong Zhang */
51006bce7ff8SHong Zhang #undef __FUNCT__
51016bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
51026bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
51036bce7ff8SHong Zhang {
51046bce7ff8SHong Zhang 
51056bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
51066bce7ff8SHong Zhang   PetscErrorCode     ierr;
510716a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
510816a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
51096bce7ff8SHong Zhang 
51106bce7ff8SHong Zhang   PetscFunctionBegin;
511116a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
511216a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
51136bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
511416a2bf60SHong Zhang 
511516a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
511616a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
511716a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
511816a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
511916a2bf60SHong Zhang   if (!b->diag){
512016a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
512116a2bf60SHong Zhang   }
5122914a18a2SHong Zhang   bdiag = b->diag;
51236bce7ff8SHong Zhang 
512416a2bf60SHong Zhang   if (n > 0) {
512516a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
51266bce7ff8SHong Zhang   }
51276bce7ff8SHong Zhang 
51286bce7ff8SHong Zhang   /* set bi and bj with new data structure */
51296bce7ff8SHong Zhang   bi = b->i;
51306bce7ff8SHong Zhang   bj = b->j;
51316bce7ff8SHong Zhang 
51326bce7ff8SHong Zhang   /* L part */
51336bce7ff8SHong Zhang   bi[0] = 0;
513416a2bf60SHong Zhang   for (i=0; i<n; i++){
51356bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
5136914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
51376bce7ff8SHong Zhang     aj = a->j + ai[i];
51386bce7ff8SHong Zhang     for (j=0; j<nz; j++){
51396bce7ff8SHong Zhang       *bj = aj[j]; bj++;
51406bce7ff8SHong Zhang     }
51416bce7ff8SHong Zhang   }
51426bce7ff8SHong Zhang 
51436bce7ff8SHong Zhang   /* U part */
514416a2bf60SHong Zhang   bi[n+1] = bi[n];
514516a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
51466bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
514716a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
51486bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
51496bce7ff8SHong Zhang     for (j=0; j<nz; j++){
51506bce7ff8SHong Zhang       *bj = aj[j]; bj++;
51516bce7ff8SHong Zhang     }
51526bce7ff8SHong Zhang     /* diag[i] */
51536bce7ff8SHong Zhang     *bj = i; bj++;
515416a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
51556bce7ff8SHong Zhang   }
51566bce7ff8SHong Zhang   PetscFunctionReturn(0);
51576bce7ff8SHong Zhang }
51586bce7ff8SHong Zhang 
515916a2bf60SHong Zhang #undef __FUNCT__
516016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
516116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
516216a2bf60SHong Zhang {
516316a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
516416a2bf60SHong Zhang   IS                 isicol;
516516a2bf60SHong Zhang   PetscErrorCode     ierr;
516616a2bf60SHong Zhang   const PetscInt     *r,*ic;
51677fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
516816a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
516916a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
517016a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
51717fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
517216a2bf60SHong Zhang   PetscReal          f;
517316a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
517416a2bf60SHong Zhang   PetscBT            lnkbt;
517516a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
517616a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
517716a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
517816a2bf60SHong Zhang   PetscTruth         missing;
51797fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
518016a2bf60SHong Zhang 
518116a2bf60SHong Zhang   PetscFunctionBegin;
518216a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
518316a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
518416a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
518516a2bf60SHong Zhang 
518616a2bf60SHong Zhang   f             = info->fill;
518716a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
518816a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
518916a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
519016a2bf60SHong Zhang 
519116a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
519216a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
51937fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
519416a2bf60SHong Zhang 
51957fa3a6a0SHong Zhang   if (!levels && both_identity) {
519616a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
519716a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
519816a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
51997fa3a6a0SHong Zhang     /* set MatSolve routines */
52007fa3a6a0SHong Zhang     switch (bs){
52017fa3a6a0SHong Zhang     case 2:
52027fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
52037fa3a6a0SHong Zhang       break;
52047fa3a6a0SHong Zhang     case 3:
52057fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
52067fa3a6a0SHong Zhang       break;
52077fa3a6a0SHong Zhang     case 4:
52087fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
52097fa3a6a0SHong Zhang       break;
52107fa3a6a0SHong Zhang     case 5:
52117fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
52127fa3a6a0SHong Zhang       break;
52137fa3a6a0SHong Zhang     case 6:
52147fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
52157fa3a6a0SHong Zhang       break;
52167fa3a6a0SHong Zhang     case 7:
52177fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
52187fa3a6a0SHong Zhang       break;
52197fa3a6a0SHong Zhang     default:
52207fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
52217fa3a6a0SHong Zhang       break;
52227fa3a6a0SHong Zhang     }
522316a2bf60SHong Zhang 
522416a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
522516a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
522616a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
522716a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
522816a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
522916a2bf60SHong Zhang     b->row           = isrow;
523016a2bf60SHong Zhang     b->col           = iscol;
523116a2bf60SHong Zhang     b->icol          = isicol;
523216a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
523316a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
523416a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5235b588c5a2SHong Zhang     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
523616a2bf60SHong Zhang     PetscFunctionReturn(0);
523716a2bf60SHong Zhang   }
523816a2bf60SHong Zhang 
523916a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
524016a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
524116a2bf60SHong Zhang 
524216a2bf60SHong Zhang   /* get new row pointers */
524316a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
524416a2bf60SHong Zhang   bi[0] = 0;
524516a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
524616a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
524716a2bf60SHong Zhang   bdiag[0]  = 0;
524816a2bf60SHong Zhang 
524916a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
525016a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
525116a2bf60SHong Zhang 
525216a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
525316a2bf60SHong Zhang   nlnk = n + 1;
525416a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
525516a2bf60SHong Zhang 
525616a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
525716a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
525816a2bf60SHong Zhang   current_space = free_space;
525916a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
526016a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
526116a2bf60SHong Zhang 
526216a2bf60SHong Zhang   for (i=0; i<n; i++) {
526316a2bf60SHong Zhang     nzi = 0;
526416a2bf60SHong Zhang     /* copy current row into linked list */
526516a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
526616a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
526716a2bf60SHong Zhang     cols = aj + ai[r[i]];
526816a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
526916a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
527016a2bf60SHong Zhang     nzi += nlnk;
527116a2bf60SHong Zhang 
527216a2bf60SHong Zhang     /* make sure diagonal entry is included */
527316a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
527416a2bf60SHong Zhang       fm = n;
527516a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
527616a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
527716a2bf60SHong Zhang       lnk[fm]    = i;
527816a2bf60SHong Zhang       lnk_lvl[i] = 0;
527916a2bf60SHong Zhang       nzi++; dcount++;
528016a2bf60SHong Zhang     }
528116a2bf60SHong Zhang 
528216a2bf60SHong Zhang     /* add pivot rows into the active row */
528316a2bf60SHong Zhang     nzbd = 0;
528416a2bf60SHong Zhang     prow = lnk[n];
528516a2bf60SHong Zhang     while (prow < i) {
528616a2bf60SHong Zhang       nnz      = bdiag[prow];
528716a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
528816a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
528916a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
529016a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
529116a2bf60SHong Zhang       nzi += nlnk;
529216a2bf60SHong Zhang       prow = lnk[prow];
529316a2bf60SHong Zhang       nzbd++;
529416a2bf60SHong Zhang     }
529516a2bf60SHong Zhang     bdiag[i] = nzbd;
529616a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
529716a2bf60SHong Zhang 
529816a2bf60SHong Zhang     /* if free space is not available, make more free space */
529916a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
530016a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
530116a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
530216a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
530316a2bf60SHong Zhang       reallocs++;
530416a2bf60SHong Zhang     }
530516a2bf60SHong Zhang 
530616a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
530716a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
530816a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
530916a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
531016a2bf60SHong Zhang 
531116a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
531216a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
531316a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
531416a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
531516a2bf60SHong Zhang     }
531616a2bf60SHong Zhang 
531716a2bf60SHong Zhang     current_space->array           += nzi;
531816a2bf60SHong Zhang     current_space->local_used      += nzi;
531916a2bf60SHong Zhang     current_space->local_remaining -= nzi;
532016a2bf60SHong Zhang     current_space_lvl->array           += nzi;
532116a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
532216a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
532316a2bf60SHong Zhang   }
532416a2bf60SHong Zhang 
532516a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
532616a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
532716a2bf60SHong Zhang 
532816a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
532916a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
533016a2bf60SHong Zhang 
533116a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5332783ef271SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
533316a2bf60SHong Zhang 
533416a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
533516a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
533616a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
533716a2bf60SHong Zhang 
533816a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
533916a2bf60SHong Zhang   {
534016a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
534116a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
534216a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
534316a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
534416a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
534516a2bf60SHong Zhang     if (diagonal_fill) {
534616a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
534716a2bf60SHong Zhang     }
534816a2bf60SHong Zhang   }
534916a2bf60SHong Zhang #endif
535016a2bf60SHong Zhang 
535116a2bf60SHong Zhang   /* put together the new matrix */
535216a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
535316a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
535416a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
535516a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
535616a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
535716a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
53587fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
535916a2bf60SHong Zhang   b->j          = bj;
536016a2bf60SHong Zhang   b->i          = bi;
536116a2bf60SHong Zhang   b->diag       = bdiag;
53627f53bb6cSHong Zhang   b->free_diag  = PETSC_TRUE;
536316a2bf60SHong Zhang   b->ilen       = 0;
536416a2bf60SHong Zhang   b->imax       = 0;
536516a2bf60SHong Zhang   b->row        = isrow;
536616a2bf60SHong Zhang   b->col        = iscol;
536716a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
536816a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
536916a2bf60SHong Zhang   b->icol       = isicol;
53707fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
537116a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
537216a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
53737fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
537416a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
537516a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
537616a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
537716a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
537816a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
53797fa3a6a0SHong Zhang   /* set MatSolve routines */
53807fa3a6a0SHong Zhang   if (both_identity){
53817fa3a6a0SHong Zhang     switch (bs){
53827fa3a6a0SHong Zhang     case 2:
53837fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
53847fa3a6a0SHong Zhang       break;
53857fa3a6a0SHong Zhang     case 3:
53867fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
53877fa3a6a0SHong Zhang       break;
53887fa3a6a0SHong Zhang     case 4:
53897fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
53907fa3a6a0SHong Zhang       break;
53917fa3a6a0SHong Zhang     case 5:
53927fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
53937fa3a6a0SHong Zhang       break;
53947fa3a6a0SHong Zhang     case 6:
53957fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
53967fa3a6a0SHong Zhang       break;
53977fa3a6a0SHong Zhang     case 7:
53987fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
53997fa3a6a0SHong Zhang       break;
54007fa3a6a0SHong Zhang     default:
54017fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
54027fa3a6a0SHong Zhang       break;
54037fa3a6a0SHong Zhang     }
54047fa3a6a0SHong Zhang   } else {
54057fa3a6a0SHong Zhang     switch (bs){
54067fa3a6a0SHong Zhang     case 2:
54077fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
54087fa3a6a0SHong Zhang       break;
54097fa3a6a0SHong Zhang     case 3:
54107fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
54117fa3a6a0SHong Zhang       break;
54127fa3a6a0SHong Zhang     case 4:
54137fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
54147fa3a6a0SHong Zhang       break;
54157fa3a6a0SHong Zhang     case 5:
54167fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
54177fa3a6a0SHong Zhang       break;
54187fa3a6a0SHong Zhang     case 6:
54197fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
54207fa3a6a0SHong Zhang       break;
54217fa3a6a0SHong Zhang     case 7:
54227fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
54237fa3a6a0SHong Zhang       break;
54247fa3a6a0SHong Zhang     default:
54257fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
54267fa3a6a0SHong Zhang       break;
54277fa3a6a0SHong Zhang     }
54287fa3a6a0SHong Zhang   }
542916a2bf60SHong Zhang   PetscFunctionReturn(0);
543016a2bf60SHong Zhang }
543116a2bf60SHong Zhang 
54324e2b4712SSatish Balay /*
54334e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
54344e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
54354e2b4712SSatish Balay    Not a good example of code reuse.
54364e2b4712SSatish Balay */
54374a2ae208SSatish Balay #undef __FUNCT__
54384a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
54390481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
54404e2b4712SSatish Balay {
54414e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
54424e2b4712SSatish Balay   IS             isicol;
54436849ba73SBarry Smith   PetscErrorCode ierr;
54445d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
54455d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5446a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5447d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
544841df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5449329f5518SBarry Smith   PetscReal      f;
545016a2bf60SHong Zhang   PetscTruth     newdatastruct=PETSC_FALSE;
54514e2b4712SSatish Balay 
54524e2b4712SSatish Balay   PetscFunctionBegin;
545316a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
545416a2bf60SHong Zhang   if (newdatastruct){
545516a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
545616a2bf60SHong Zhang     PetscFunctionReturn(0);
545716a2bf60SHong Zhang   }
545816a2bf60SHong Zhang 
54596bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
54606bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
54616bce7ff8SHong Zhang 
5462435faa5fSBarry Smith   f             = info->fill;
5463690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5464690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
54654c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
546616a2bf60SHong Zhang 
5467667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5468667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
54697d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5470309c388cSBarry Smith 
547141df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
547216a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
54736bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
54746bce7ff8SHong Zhang 
5475719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5476719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
5477bb3d539aSBarry Smith     b->row       = isrow;
5478bb3d539aSBarry Smith     b->col       = iscol;
5479bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5480bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5481bb3d539aSBarry Smith     b->icol      = isicol;
5482bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5483b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
54846bce7ff8SHong Zhang     PetscFunctionReturn(0);
54856bce7ff8SHong Zhang   }
54866bce7ff8SHong Zhang 
54876bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
54884e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
54894e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
54904e2b4712SSatish Balay 
54914e2b4712SSatish Balay     /* get new row pointers */
5492690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
54934e2b4712SSatish Balay     ainew[0] = 0;
54944e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5495690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5496690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
54974e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5498690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
54994e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5500690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
55014e2b4712SSatish Balay     /* im is level for each filled value */
5502690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
55034e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5504690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
55054e2b4712SSatish Balay     dloc[0]  = 0;
55064e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5507435faa5fSBarry Smith 
5508435faa5fSBarry Smith       /* copy prow into linked list */
55094e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
55103b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
55114e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
55124e2b4712SSatish Balay       fill[n]    = n;
5513435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
55144e2b4712SSatish Balay       while (nz--) {
55154e2b4712SSatish Balay 	fm  = n;
55164e2b4712SSatish Balay 	idx = ic[*xi++];
55174e2b4712SSatish Balay 	do {
55184e2b4712SSatish Balay 	  m  = fm;
55194e2b4712SSatish Balay 	  fm = fill[m];
55204e2b4712SSatish Balay 	} while (fm < idx);
55214e2b4712SSatish Balay 	fill[m]   = idx;
55224e2b4712SSatish Balay 	fill[idx] = fm;
55234e2b4712SSatish Balay 	im[idx]   = 0;
55244e2b4712SSatish Balay       }
5525435faa5fSBarry Smith 
5526435faa5fSBarry Smith       /* make sure diagonal entry is included */
5527435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5528435faa5fSBarry Smith 	fm = n;
5529435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5530435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5531435faa5fSBarry Smith 	fill[fm]   = prow;
5532435faa5fSBarry Smith 	im[prow]   = 0;
5533435faa5fSBarry Smith 	nzf++;
5534335d9088SBarry Smith 	dcount++;
5535435faa5fSBarry Smith       }
5536435faa5fSBarry Smith 
55374e2b4712SSatish Balay       nzi = 0;
55384e2b4712SSatish Balay       row = fill[n];
55394e2b4712SSatish Balay       while (row < prow) {
55404e2b4712SSatish Balay 	incrlev = im[row] + 1;
55414e2b4712SSatish Balay 	nz      = dloc[row];
5542435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
55434e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
55444e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
55454e2b4712SSatish Balay 	fm      = row;
55464e2b4712SSatish Balay 	while (nnz-- > 0) {
55474e2b4712SSatish Balay 	  idx = *xi++;
55484e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
55494e2b4712SSatish Balay 	    flev++;
55504e2b4712SSatish Balay 	    continue;
55514e2b4712SSatish Balay 	  }
55524e2b4712SSatish Balay 	  do {
55534e2b4712SSatish Balay 	    m  = fm;
55544e2b4712SSatish Balay 	    fm = fill[m];
55554e2b4712SSatish Balay 	  } while (fm < idx);
55564e2b4712SSatish Balay 	  if (fm != idx) {
55574e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
55584e2b4712SSatish Balay 	    fill[m]   = idx;
55594e2b4712SSatish Balay 	    fill[idx] = fm;
55604e2b4712SSatish Balay 	    fm        = idx;
55614e2b4712SSatish Balay 	    nzf++;
5562ecf371e4SBarry Smith 	  } else {
55634e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
55644e2b4712SSatish Balay 	  }
55654e2b4712SSatish Balay 	  flev++;
55664e2b4712SSatish Balay 	}
55674e2b4712SSatish Balay 	row = fill[row];
55684e2b4712SSatish Balay 	nzi++;
55694e2b4712SSatish Balay       }
55704e2b4712SSatish Balay       /* copy new filled row into permanent storage */
55714e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
55724e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5573ecf371e4SBarry Smith 
5574ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5575ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5576ecf371e4SBarry Smith 	/* just double the memory each time */
5577690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5578ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
55794e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
55804e2b4712SSatish Balay 	jmax += maxadd;
5581ecf371e4SBarry Smith 
5582ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
55835d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
55845d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5585606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
55865d0c19d7SBarry Smith 	ajnew = xitmp;
55875d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
55885d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5589606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
55905d0c19d7SBarry Smith 	ajfill = xitmp;
5591eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
55924e2b4712SSatish Balay       }
55935d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
55944e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
55954e2b4712SSatish Balay       dloc[prow]  = nzi;
55964e2b4712SSatish Balay       fm          = fill[n];
55974e2b4712SSatish Balay       while (nzf--) {
55985d0c19d7SBarry Smith 	*xitmp++ = fm;
55994e2b4712SSatish Balay 	*flev++ = im[fm];
56004e2b4712SSatish Balay 	fm      = fill[fm];
56014e2b4712SSatish Balay       }
5602435faa5fSBarry Smith       /* make sure row has diagonal entry */
5603435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
560477431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
56052401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5606435faa5fSBarry Smith       }
56074e2b4712SSatish Balay     }
5608606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
56094e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
56104e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5611606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5612606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
56134e2b4712SSatish Balay 
56146cf91177SBarry Smith #if defined(PETSC_USE_INFO)
56154e2b4712SSatish Balay     {
5616329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5617ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5618ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5619ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5620ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5621335d9088SBarry Smith       if (diagonal_fill) {
5622ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5623335d9088SBarry Smith       }
56244e2b4712SSatish Balay     }
562563ba0a88SBarry Smith #endif
56264e2b4712SSatish Balay 
56274e2b4712SSatish Balay     /* put together the new matrix */
5628719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5629719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5630719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
5631e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5632e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
56337c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5634a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
56354e2b4712SSatish Balay     b->j          = ajnew;
56364e2b4712SSatish Balay     b->i          = ainew;
56374e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
56384e2b4712SSatish Balay     b->diag       = dloc;
56397f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
56404e2b4712SSatish Balay     b->ilen       = 0;
56414e2b4712SSatish Balay     b->imax       = 0;
56424e2b4712SSatish Balay     b->row        = isrow;
56434e2b4712SSatish Balay     b->col        = iscol;
5644bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5645c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5646c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5647e51c0b9cSSatish Balay     b->icol       = isicol;
564887828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
56494e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
56504e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5651719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
56524e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
56534e2b4712SSatish Balay 
5654719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
5655719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
5656719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
56576bce7ff8SHong Zhang 
565841df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
56598661488fSKris Buschelman   PetscFunctionReturn(0);
56608661488fSKris Buschelman }
56618661488fSKris Buschelman 
5662732ee342SKris Buschelman #undef __FUNCT__
56637e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5664dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
56657e7071cdSKris Buschelman {
566612272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
566712272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
56685a9542e3SKris Buschelman   PetscFunctionBegin;
56697cf1b8d3SKris Buschelman   /* Undo Column scaling */
56707cf1b8d3SKris Buschelman /*    while (nz--) { */
56717cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
56727cf1b8d3SKris Buschelman /*    } */
5673c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5674c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
56757cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
56767cf1b8d3SKris Buschelman }
56777cf1b8d3SKris Buschelman 
56787cf1b8d3SKris Buschelman #undef __FUNCT__
56797cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5680dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
56817cf1b8d3SKris Buschelman {
56827cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5683b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
56842aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
56855a9542e3SKris Buschelman   PetscFunctionBegin;
56860b9da03eSKris Buschelman   /* Is this really necessary? */
568720235379SKris Buschelman   while (nz--) {
56880b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
56897e7071cdSKris Buschelman   }
5690c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
56917e7071cdSKris Buschelman   PetscFunctionReturn(0);
56927e7071cdSKris Buschelman }
56937e7071cdSKris Buschelman 
5694732ee342SKris Buschelman 
5695