xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 35aa4fcf62f8e6f9997ee9ceb96d3f6e04530fe4)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
12818f690400SShri Abhyankar {
12828f690400SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12838f690400SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
12848f690400SShri Abhyankar   PetscErrorCode ierr;
12858f690400SShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
128629b92fc1SShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
12878f690400SShri Abhyankar   MatScalar      *aa=a->a,*v;
12888f690400SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
12898f690400SShri Abhyankar   PetscScalar    *x,*b,*t;
12908f690400SShri Abhyankar 
12918f690400SShri Abhyankar   PetscFunctionBegin;
12928f690400SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12938f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12948f690400SShri Abhyankar   t  = a->solve_work;
12958f690400SShri Abhyankar 
12968f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
129729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12988f690400SShri Abhyankar 
12998f690400SShri Abhyankar   /* forward solve the lower triangular */
130029b92fc1SShri Abhyankar   idx    = 7*r[0];
13018f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
13028f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
13038f690400SShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
13048f690400SShri Abhyankar 
13058f690400SShri Abhyankar   for (i=1; i<n; i++) {
13068f690400SShri Abhyankar     v     = aa + 49*ai[i];
13078f690400SShri Abhyankar     vi    = aj + ai[i];
13088f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
130929b92fc1SShri Abhyankar     idx   = 7*r[i];
13108f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
13118f690400SShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
131229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
131329b92fc1SShri Abhyankar       idx   = 7*vi[m];
13148f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
13158f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
13168f690400SShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
13178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13228f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13238f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13248f690400SShri Abhyankar       v += 49;
13258f690400SShri Abhyankar     }
13268f690400SShri Abhyankar     idx = 7*i;
13278f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
13288f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
13298f690400SShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
13308f690400SShri Abhyankar   }
13318f690400SShri Abhyankar   /* backward solve the upper triangular */
13328f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
13338f690400SShri Abhyankar     k    = 2*n-i;
13348f690400SShri Abhyankar     v    = aa + 49*ai[k];
13358f690400SShri Abhyankar     vi   = aj + ai[k];
13368f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
13378f690400SShri Abhyankar     idt  = 7*i;
13388f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
13398f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
13408f690400SShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
134129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
134229b92fc1SShri Abhyankar       idx   = 7*vi[m];
13438f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
13448f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13458f690400SShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
13468f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13478f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13488f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13498f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13508f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13518f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13528f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13538f690400SShri Abhyankar       v += 49;
13548f690400SShri Abhyankar     }
135529b92fc1SShri Abhyankar     idc = 7*c[i];
13568f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13578f690400SShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13588f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13598f690400SShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
13608f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
13618f690400SShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
13628f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
13638f690400SShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
13648f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
13658f690400SShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
13668f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
13678f690400SShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
13688f690400SShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
13698f690400SShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13708f690400SShri Abhyankar   }
13718f690400SShri Abhyankar 
13728f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13738f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13748f690400SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13758f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13768f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13778f690400SShri Abhyankar   PetscFunctionReturn(0);
13788f690400SShri Abhyankar }
13798f690400SShri Abhyankar 
13808f690400SShri Abhyankar #undef __FUNCT__
1381*35aa4fcfSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct_v2"
1382*35aa4fcfSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct_v2(Mat A,Vec bb,Vec xx)
1383*35aa4fcfSShri Abhyankar {
1384*35aa4fcfSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1385*35aa4fcfSShri Abhyankar   IS             iscol=a->col,isrow=a->row;
1386*35aa4fcfSShri Abhyankar   PetscErrorCode ierr;
1387*35aa4fcfSShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
1388*35aa4fcfSShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
1389*35aa4fcfSShri Abhyankar   MatScalar      *aa=a->a,*v;
1390*35aa4fcfSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1391*35aa4fcfSShri Abhyankar   PetscScalar    *x,*b,*t;
1392*35aa4fcfSShri Abhyankar 
1393*35aa4fcfSShri Abhyankar   PetscFunctionBegin;
1394*35aa4fcfSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1395*35aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1396*35aa4fcfSShri Abhyankar   t  = a->solve_work;
1397*35aa4fcfSShri Abhyankar 
1398*35aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1399*35aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1400*35aa4fcfSShri Abhyankar 
1401*35aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
1402*35aa4fcfSShri Abhyankar   idx    = 7*r[0];
1403*35aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
1404*35aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1405*35aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
1406*35aa4fcfSShri Abhyankar 
1407*35aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
1408*35aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
1409*35aa4fcfSShri Abhyankar     vi    = aj + ai[i];
1410*35aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
1411*35aa4fcfSShri Abhyankar     idx   = 7*r[i];
1412*35aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1413*35aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1414*35aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
1415*35aa4fcfSShri Abhyankar       idx   = 7*vi[m];
1416*35aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1417*35aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
1418*35aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
1419*35aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1420*35aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1421*35aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1422*35aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1423*35aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1424*35aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1425*35aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1426*35aa4fcfSShri Abhyankar       v += 49;
1427*35aa4fcfSShri Abhyankar     }
1428*35aa4fcfSShri Abhyankar     idx = 7*i;
1429*35aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
1430*35aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1431*35aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
1432*35aa4fcfSShri Abhyankar   }
1433*35aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
1434*35aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
1435*35aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
1436*35aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
1437*35aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
1438*35aa4fcfSShri Abhyankar     idt  = 7*i;
1439*35aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
1440*35aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1441*35aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
1442*35aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
1443*35aa4fcfSShri Abhyankar       idx   = 7*vi[m];
1444*35aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
1445*35aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1446*35aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
1447*35aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1448*35aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1449*35aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1450*35aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1451*35aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1452*35aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1453*35aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1454*35aa4fcfSShri Abhyankar       v += 49;
1455*35aa4fcfSShri Abhyankar     }
1456*35aa4fcfSShri Abhyankar     idc = 7*c[i];
1457*35aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1458*35aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1459*35aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1460*35aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1461*35aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1462*35aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1463*35aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1464*35aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1465*35aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1466*35aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1467*35aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1468*35aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1469*35aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1470*35aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
1471*35aa4fcfSShri Abhyankar   }
1472*35aa4fcfSShri Abhyankar 
1473*35aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1474*35aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1475*35aa4fcfSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1476*35aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1477*35aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1478*35aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
1479*35aa4fcfSShri Abhyankar }
1480*35aa4fcfSShri Abhyankar 
1481*35aa4fcfSShri Abhyankar #undef __FUNCT__
14824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1483dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
148415091d37SBarry Smith {
148515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1486690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1487dfbe8321SBarry Smith   PetscErrorCode    ierr;
1488690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1489d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1490d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1491d9fead3dSBarry Smith   const PetscScalar *b;
149215091d37SBarry Smith 
149315091d37SBarry Smith   PetscFunctionBegin;
1494d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14951ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
149615091d37SBarry Smith   /* forward solve the lower triangular */
149715091d37SBarry Smith   idx    = 0;
149815091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
149915091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
150015091d37SBarry Smith   x[6] = b[6+idx];
150115091d37SBarry Smith   for (i=1; i<n; i++) {
150215091d37SBarry Smith     v     =  aa + 49*ai[i];
150315091d37SBarry Smith     vi    =  aj + ai[i];
150415091d37SBarry Smith     nz    =  diag[i] - ai[i];
150515091d37SBarry Smith     idx   =  7*i;
1506f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1507f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1508f1af5d2fSBarry Smith     s7  =  b[6+idx];
150915091d37SBarry Smith     while (nz--) {
151015091d37SBarry Smith       jdx   = 7*(*vi++);
151115091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
151215091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
151315091d37SBarry Smith       x7    = x[6+jdx];
1514f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1515f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1516f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1517f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1518f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1519f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1520f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
152115091d37SBarry Smith       v += 49;
152215091d37SBarry Smith      }
1523f1af5d2fSBarry Smith     x[idx]   = s1;
1524f1af5d2fSBarry Smith     x[1+idx] = s2;
1525f1af5d2fSBarry Smith     x[2+idx] = s3;
1526f1af5d2fSBarry Smith     x[3+idx] = s4;
1527f1af5d2fSBarry Smith     x[4+idx] = s5;
1528f1af5d2fSBarry Smith     x[5+idx] = s6;
1529f1af5d2fSBarry Smith     x[6+idx] = s7;
153015091d37SBarry Smith   }
153115091d37SBarry Smith   /* backward solve the upper triangular */
153215091d37SBarry Smith   for (i=n-1; i>=0; i--){
153315091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
153415091d37SBarry Smith     vi   = aj + diag[i] + 1;
153515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
153615091d37SBarry Smith     idt  = 7*i;
1537f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1538f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1539f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1540f1af5d2fSBarry Smith     s7 = x[6+idt];
154115091d37SBarry Smith     while (nz--) {
154215091d37SBarry Smith       idx   = 7*(*vi++);
154315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
154415091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
154515091d37SBarry Smith       x7    = x[6+idx];
1546f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1547f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1548f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1549f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1550f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1551f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1552f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
155315091d37SBarry Smith       v += 49;
155415091d37SBarry Smith     }
155515091d37SBarry Smith     v        = aa + 49*diag[i];
1556f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1557f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1558f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1559f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1560f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1561f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1562f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1563f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1564f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1565f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1566f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1567f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1568f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1569f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
157015091d37SBarry Smith   }
157115091d37SBarry Smith 
1572d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1574dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
157515091d37SBarry Smith   PetscFunctionReturn(0);
157615091d37SBarry Smith }
157715091d37SBarry Smith 
15784a2ae208SSatish Balay #undef __FUNCT__
1579cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1580cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1581cee9d6f2SShri Abhyankar {
1582cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
15836464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1584cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1585cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1586cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1587cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1588cee9d6f2SShri Abhyankar     PetscScalar       *x;
1589cee9d6f2SShri Abhyankar     const PetscScalar *b;
1590cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1591cee9d6f2SShri Abhyankar 
1592cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1593cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1594cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1595cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1596cee9d6f2SShri Abhyankar     idx    = 0;
1597cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1598cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1599cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1600cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1601cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1602cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1603cee9d6f2SShri Abhyankar       idx   = bs*i;
1604cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1605cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
16066464896eSShri Abhyankar        for(k=0;k<nz;k++) {
16076464896eSShri Abhyankar           jdx   = bs*vi[k];
1608cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1609cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1610cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1611cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1612cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1613cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1614cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1615cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1616cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1617cee9d6f2SShri Abhyankar           v   +=  bs2;
1618cee9d6f2SShri Abhyankar         }
1619cee9d6f2SShri Abhyankar 
1620cee9d6f2SShri Abhyankar        x[idx]   = s1;
1621cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1622cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1623cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1624cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1625cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1626cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1627cee9d6f2SShri Abhyankar     }
1628cee9d6f2SShri Abhyankar 
1629cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1630cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1631cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1632cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1633cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1634cee9d6f2SShri Abhyankar      idt = bs*i;
1635cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1636cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
16376464896eSShri Abhyankar     for(k=0;k<nz;k++) {
16386464896eSShri Abhyankar       idx   = bs*vi[k];
1639cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1640cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1641cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1642cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1643cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1644cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1645cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1646cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1647cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1648cee9d6f2SShri Abhyankar         v   +=  bs2;
1649cee9d6f2SShri Abhyankar     }
1650cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1651cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1652cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1653cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1654cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1655cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1656cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1657cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1658cee9d6f2SShri Abhyankar   }
1659cee9d6f2SShri Abhyankar 
1660cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1661cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1662cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1663cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1664cee9d6f2SShri Abhyankar }
1665cee9d6f2SShri Abhyankar 
1666cee9d6f2SShri Abhyankar #undef __FUNCT__
166753cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
166853cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
166953cca76cSShri Abhyankar {
167053cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
167153cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
167253cca76cSShri Abhyankar     PetscErrorCode    ierr;
167353cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
167453cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
167553cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
167653cca76cSShri Abhyankar     PetscScalar       *x;
167753cca76cSShri Abhyankar     const PetscScalar *b;
167853cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
167953cca76cSShri Abhyankar 
168053cca76cSShri Abhyankar     PetscFunctionBegin;
168153cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
168253cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
168353cca76cSShri Abhyankar     /* forward solve the lower triangular */
168453cca76cSShri Abhyankar     idx    = 0;
168553cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
168653cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
168753cca76cSShri Abhyankar     for (i=1; i<n; i++) {
168853cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
168953cca76cSShri Abhyankar        vi   = aj + ai[i];
169053cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
169153cca76cSShri Abhyankar       idx   = bs*i;
169253cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
169353cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
169453cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
169553cca76cSShri Abhyankar           jdx   = bs*vi[k];
169653cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
169753cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
169853cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
169953cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
170053cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
170153cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
170253cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
170353cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
170453cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
170553cca76cSShri Abhyankar           v   +=  bs2;
170653cca76cSShri Abhyankar         }
170753cca76cSShri Abhyankar 
170853cca76cSShri Abhyankar        x[idx]   = s1;
170953cca76cSShri Abhyankar        x[1+idx] = s2;
171053cca76cSShri Abhyankar        x[2+idx] = s3;
171153cca76cSShri Abhyankar        x[3+idx] = s4;
171253cca76cSShri Abhyankar        x[4+idx] = s5;
171353cca76cSShri Abhyankar        x[5+idx] = s6;
171453cca76cSShri Abhyankar        x[6+idx] = s7;
171553cca76cSShri Abhyankar     }
171653cca76cSShri Abhyankar 
171753cca76cSShri Abhyankar    /* backward solve the upper triangular */
171853cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
171953cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
172053cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
172153cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
172253cca76cSShri Abhyankar      idt = bs*i;
172353cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
172453cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
172553cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
172653cca76cSShri Abhyankar       idx   = bs*vi[k];
172753cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
172853cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
172953cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
173053cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
173153cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
173253cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
173353cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
173453cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
173553cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
173653cca76cSShri Abhyankar         v   +=  bs2;
173753cca76cSShri Abhyankar     }
173853cca76cSShri Abhyankar     /* x = inv_diagonal*x */
173953cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
174053cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
174153cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
174253cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
174353cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
174453cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
174553cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
174653cca76cSShri Abhyankar   }
174753cca76cSShri Abhyankar 
174853cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
174953cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175053cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
175153cca76cSShri Abhyankar   PetscFunctionReturn(0);
175253cca76cSShri Abhyankar }
175353cca76cSShri Abhyankar 
175453cca76cSShri Abhyankar #undef __FUNCT__
17554a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1756dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
175715091d37SBarry Smith {
175815091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
175915091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
17606849ba73SBarry Smith   PetscErrorCode    ierr;
17615d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
17625d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1763d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1764d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1765d9fead3dSBarry Smith   const PetscScalar *b;
176615091d37SBarry Smith   PetscFunctionBegin;
1767d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17681ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1769f1af5d2fSBarry Smith   t  = a->solve_work;
177015091d37SBarry Smith 
177115091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
177215091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
177315091d37SBarry Smith 
177415091d37SBarry Smith   /* forward solve the lower triangular */
177515091d37SBarry Smith   idx    = 6*(*r++);
1776f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1777f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1778f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
177915091d37SBarry Smith   for (i=1; i<n; i++) {
178015091d37SBarry Smith     v     = aa + 36*ai[i];
178115091d37SBarry Smith     vi    = aj + ai[i];
178215091d37SBarry Smith     nz    = diag[i] - ai[i];
178315091d37SBarry Smith     idx   = 6*(*r++);
1784f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1785f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
178615091d37SBarry Smith     while (nz--) {
178715091d37SBarry Smith       idx   = 6*(*vi++);
1788f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1789f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1790f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1791f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1792f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1793f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1794f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1795f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
179615091d37SBarry Smith       v += 36;
179715091d37SBarry Smith     }
179815091d37SBarry Smith     idx = 6*i;
1799f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1800f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1801f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
180215091d37SBarry Smith   }
180315091d37SBarry Smith   /* backward solve the upper triangular */
180415091d37SBarry Smith   for (i=n-1; i>=0; i--){
180515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
180615091d37SBarry Smith     vi   = aj + diag[i] + 1;
180715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
180815091d37SBarry Smith     idt  = 6*i;
1809f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1810f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1811f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
181215091d37SBarry Smith     while (nz--) {
181315091d37SBarry Smith       idx   = 6*(*vi++);
1814f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1815f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1816f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1817f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1818f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1819f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1820f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1821f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1822f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
182315091d37SBarry Smith       v += 36;
182415091d37SBarry Smith     }
182515091d37SBarry Smith     idc = 6*(*c--);
182615091d37SBarry Smith     v   = aa + 36*diag[i];
1827f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1828f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1829f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1830f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1831f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1832f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1833f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1834f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1835f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1836f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1837f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1838f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
183915091d37SBarry Smith   }
184015091d37SBarry Smith 
184115091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
184215091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1843d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1845dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
184615091d37SBarry Smith   PetscFunctionReturn(0);
184715091d37SBarry Smith }
184815091d37SBarry Smith 
18494a2ae208SSatish Balay #undef __FUNCT__
18508f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
18518f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
18528f690400SShri Abhyankar {
18538f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
18548f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
18558f690400SShri Abhyankar   PetscErrorCode    ierr;
18568f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
185729b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
18588f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
18598f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
18608f690400SShri Abhyankar   const PetscScalar *b;
18618f690400SShri Abhyankar   PetscFunctionBegin;
18628f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18638f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
18648f690400SShri Abhyankar   t  = a->solve_work;
18658f690400SShri Abhyankar 
18668f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
186729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
18688f690400SShri Abhyankar 
18698f690400SShri Abhyankar   /* forward solve the lower triangular */
187029b92fc1SShri Abhyankar   idx    = 6*r[0];
18718f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
18728f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
18738f690400SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
18748f690400SShri Abhyankar   for (i=1; i<n; i++) {
18758f690400SShri Abhyankar     v     = aa + 36*ai[i];
18768f690400SShri Abhyankar     vi    = aj + ai[i];
18778f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
187829b92fc1SShri Abhyankar     idx   = 6*r[i];
18798f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
18808f690400SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
188129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
188229b92fc1SShri Abhyankar       idx   = 6*vi[m];
18838f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
18848f690400SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
18858f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18868f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18878f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18888f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18898f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18908f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18918f690400SShri Abhyankar       v += 36;
18928f690400SShri Abhyankar     }
18938f690400SShri Abhyankar     idx = 6*i;
18948f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
18958f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
18968f690400SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
18978f690400SShri Abhyankar   }
18988f690400SShri Abhyankar   /* backward solve the upper triangular */
18998f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
19008f690400SShri Abhyankar     k    = 2*n-i;
19018f690400SShri Abhyankar     v    = aa + 36*ai[k];
19028f690400SShri Abhyankar     vi   = aj + ai[k];
19038f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
19048f690400SShri Abhyankar     idt  = 6*i;
19058f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
19068f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
19078f690400SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
190829b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
190929b92fc1SShri Abhyankar       idx   = 6*vi[m];
19108f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
19118f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
19128f690400SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
19138f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
19148f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
19158f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
19168f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
19178f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
19188f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
19198f690400SShri Abhyankar       v += 36;
19208f690400SShri Abhyankar     }
192129b92fc1SShri Abhyankar     idc = 6*c[i];
19228f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
19238f690400SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
19248f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
19258f690400SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
19268f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
19278f690400SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
19288f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
19298f690400SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
19308f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
19318f690400SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
19328f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
19338f690400SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
19348f690400SShri Abhyankar   }
19358f690400SShri Abhyankar 
19368f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
19378f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19388f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19398f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
19408f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
19418f690400SShri Abhyankar   PetscFunctionReturn(0);
19428f690400SShri Abhyankar }
19438f690400SShri Abhyankar 
19446506fda5SShri Abhyankar #undef __FUNCT__
19456506fda5SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct_v2"
19466506fda5SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct_v2(Mat A,Vec bb,Vec xx)
19476506fda5SShri Abhyankar {
19486506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
19496506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
19506506fda5SShri Abhyankar   PetscErrorCode    ierr;
19516506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
19526506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
19536506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
19546506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
19556506fda5SShri Abhyankar   const PetscScalar *b;
19566506fda5SShri Abhyankar   PetscFunctionBegin;
19576506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19586506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
19596506fda5SShri Abhyankar   t  = a->solve_work;
19606506fda5SShri Abhyankar 
19616506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
19626506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
19636506fda5SShri Abhyankar 
19646506fda5SShri Abhyankar   /* forward solve the lower triangular */
19656506fda5SShri Abhyankar   idx    = 6*r[0];
19666506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
19676506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
19686506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
19696506fda5SShri Abhyankar   for (i=1; i<n; i++) {
19706506fda5SShri Abhyankar     v     = aa + 36*ai[i];
19716506fda5SShri Abhyankar     vi    = aj + ai[i];
19726506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
19736506fda5SShri Abhyankar     idx   = 6*r[i];
19746506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
19756506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
19766506fda5SShri Abhyankar     for(m=0;m<nz;m++){
19776506fda5SShri Abhyankar       idx   = 6*vi[m];
19786506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
19796506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
19806506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
19816506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
19826506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
19836506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
19846506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
19856506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
19866506fda5SShri Abhyankar       v += 36;
19876506fda5SShri Abhyankar     }
19886506fda5SShri Abhyankar     idx = 6*i;
19896506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
19906506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
19916506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
19926506fda5SShri Abhyankar   }
19936506fda5SShri Abhyankar   /* backward solve the upper triangular */
19946506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
19956506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
19966506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
19976506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
19986506fda5SShri Abhyankar     idt  = 6*i;
19996506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
20006506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
20016506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
20026506fda5SShri Abhyankar     for(m=0;m<nz;m++){
20036506fda5SShri Abhyankar       idx   = 6*vi[m];
20046506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
20056506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
20066506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
20076506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
20086506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
20096506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
20106506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
20116506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
20126506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
20136506fda5SShri Abhyankar       v += 36;
20146506fda5SShri Abhyankar     }
20156506fda5SShri Abhyankar     idc = 6*c[i];
20166506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
20176506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
20186506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
20196506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
20206506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
20216506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
20226506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
20236506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
20246506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
20256506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
20266506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
20276506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
20286506fda5SShri Abhyankar   }
20296506fda5SShri Abhyankar 
20306506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
20316506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20326506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20336506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
20346506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
20356506fda5SShri Abhyankar   PetscFunctionReturn(0);
20366506fda5SShri Abhyankar }
20378f690400SShri Abhyankar 
20388f690400SShri Abhyankar #undef __FUNCT__
20394a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2040dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
204115091d37SBarry Smith {
204215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2043690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2044dfbe8321SBarry Smith   PetscErrorCode    ierr;
2045690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2046d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2047d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2048d9fead3dSBarry Smith   const PetscScalar *b;
204915091d37SBarry Smith 
205015091d37SBarry Smith   PetscFunctionBegin;
2051d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20521ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
205315091d37SBarry Smith   /* forward solve the lower triangular */
205415091d37SBarry Smith   idx    = 0;
205515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
205615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
205715091d37SBarry Smith   for (i=1; i<n; i++) {
205815091d37SBarry Smith     v     =  aa + 36*ai[i];
205915091d37SBarry Smith     vi    =  aj + ai[i];
206015091d37SBarry Smith     nz    =  diag[i] - ai[i];
206115091d37SBarry Smith     idx   =  6*i;
2062f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2063f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
206415091d37SBarry Smith     while (nz--) {
206515091d37SBarry Smith       jdx   = 6*(*vi++);
206615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
206715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2068f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2069f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2070f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2071f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2072f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2073f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
207415091d37SBarry Smith       v += 36;
207515091d37SBarry Smith      }
2076f1af5d2fSBarry Smith     x[idx]   = s1;
2077f1af5d2fSBarry Smith     x[1+idx] = s2;
2078f1af5d2fSBarry Smith     x[2+idx] = s3;
2079f1af5d2fSBarry Smith     x[3+idx] = s4;
2080f1af5d2fSBarry Smith     x[4+idx] = s5;
2081f1af5d2fSBarry Smith     x[5+idx] = s6;
208215091d37SBarry Smith   }
208315091d37SBarry Smith   /* backward solve the upper triangular */
208415091d37SBarry Smith   for (i=n-1; i>=0; i--){
208515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
208615091d37SBarry Smith     vi   = aj + diag[i] + 1;
208715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
208815091d37SBarry Smith     idt  = 6*i;
2089f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2090f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2091f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
209215091d37SBarry Smith     while (nz--) {
209315091d37SBarry Smith       idx   = 6*(*vi++);
209415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
209515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2096f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2097f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2098f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2099f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2100f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2101f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
210215091d37SBarry Smith       v += 36;
210315091d37SBarry Smith     }
210415091d37SBarry Smith     v        = aa + 36*diag[i];
2105f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2106f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2107f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2108f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2109f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2110f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
211115091d37SBarry Smith   }
211215091d37SBarry Smith 
2113d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2115dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
211615091d37SBarry Smith   PetscFunctionReturn(0);
211715091d37SBarry Smith }
211815091d37SBarry Smith 
21194a2ae208SSatish Balay #undef __FUNCT__
2120cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2121cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2122cee9d6f2SShri Abhyankar {
2123cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
21246464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2125cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2126cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2127cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2128cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2129cee9d6f2SShri Abhyankar     PetscScalar       *x;
2130cee9d6f2SShri Abhyankar     const PetscScalar *b;
2131cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2132cee9d6f2SShri Abhyankar 
2133cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2134cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2135cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2136cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
2137cee9d6f2SShri Abhyankar     idx    = 0;
2138cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2139cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
2140cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2141cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2142cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2143cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
2144cee9d6f2SShri Abhyankar       idx   = bs*i;
2145cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2146cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
21476464896eSShri Abhyankar        for(k=0;k<nz;k++){
21486464896eSShri Abhyankar           jdx   = bs*vi[k];
2149cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2150cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
2151cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2152cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2153cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2154cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2155cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2156cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2157cee9d6f2SShri Abhyankar           v   +=  bs2;
2158cee9d6f2SShri Abhyankar         }
2159cee9d6f2SShri Abhyankar 
2160cee9d6f2SShri Abhyankar        x[idx]   = s1;
2161cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2162cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2163cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2164cee9d6f2SShri Abhyankar        x[4+idx] = s5;
2165cee9d6f2SShri Abhyankar        x[5+idx] = s6;
2166cee9d6f2SShri Abhyankar     }
2167cee9d6f2SShri Abhyankar 
2168cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
2169cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2170cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2171cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2172cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2173cee9d6f2SShri Abhyankar      idt = bs*i;
2174cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2175cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
21766464896eSShri Abhyankar      for(k=0;k<nz;k++){
21776464896eSShri Abhyankar       idx   = bs*vi[k];
2178cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2179cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
2180cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
2181cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
2182cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
2183cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
2184cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
2185cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
2186cee9d6f2SShri Abhyankar         v   +=  bs2;
2187cee9d6f2SShri Abhyankar     }
2188cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2189cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2190cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2191cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2192cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2193cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2194cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2195cee9d6f2SShri Abhyankar   }
2196cee9d6f2SShri Abhyankar 
2197cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2198cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2199cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2200cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2201cee9d6f2SShri Abhyankar }
22028f690400SShri Abhyankar 
2203cee9d6f2SShri Abhyankar #undef __FUNCT__
220453cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
220553cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
220653cca76cSShri Abhyankar {
220753cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
220853cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
220953cca76cSShri Abhyankar     PetscErrorCode    ierr;
221053cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
221153cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
221253cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
221353cca76cSShri Abhyankar     PetscScalar       *x;
221453cca76cSShri Abhyankar     const PetscScalar *b;
221553cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
221653cca76cSShri Abhyankar 
221753cca76cSShri Abhyankar     PetscFunctionBegin;
221853cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
221953cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
222053cca76cSShri Abhyankar     /* forward solve the lower triangular */
222153cca76cSShri Abhyankar     idx    = 0;
222253cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
222353cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
222453cca76cSShri Abhyankar     for (i=1; i<n; i++) {
222553cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
222653cca76cSShri Abhyankar        vi   = aj + ai[i];
222753cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
222853cca76cSShri Abhyankar       idx   = bs*i;
222953cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
223053cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
223153cca76cSShri Abhyankar        for(k=0;k<nz;k++){
223253cca76cSShri Abhyankar           jdx   = bs*vi[k];
223353cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
223453cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
223553cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
223653cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
223753cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
223853cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
223953cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
224053cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
224153cca76cSShri Abhyankar           v   +=  bs2;
224253cca76cSShri Abhyankar         }
224353cca76cSShri Abhyankar 
224453cca76cSShri Abhyankar        x[idx]   = s1;
224553cca76cSShri Abhyankar        x[1+idx] = s2;
224653cca76cSShri Abhyankar        x[2+idx] = s3;
224753cca76cSShri Abhyankar        x[3+idx] = s4;
224853cca76cSShri Abhyankar        x[4+idx] = s5;
224953cca76cSShri Abhyankar        x[5+idx] = s6;
225053cca76cSShri Abhyankar     }
225153cca76cSShri Abhyankar 
225253cca76cSShri Abhyankar    /* backward solve the upper triangular */
225353cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
225453cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
225553cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
225653cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
225753cca76cSShri Abhyankar      idt = bs*i;
225853cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
225953cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
226053cca76cSShri Abhyankar      for(k=0;k<nz;k++){
226153cca76cSShri Abhyankar       idx   = bs*vi[k];
226253cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
226353cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
226453cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
226553cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
226653cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
226753cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
226853cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
226953cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
227053cca76cSShri Abhyankar         v   +=  bs2;
227153cca76cSShri Abhyankar     }
227253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
227353cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
227453cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
227553cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
227653cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
227753cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
227853cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
227953cca76cSShri Abhyankar   }
228053cca76cSShri Abhyankar 
228153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
228253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
228353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
228453cca76cSShri Abhyankar   PetscFunctionReturn(0);
228553cca76cSShri Abhyankar }
228653cca76cSShri Abhyankar 
228753cca76cSShri Abhyankar #undef __FUNCT__
22884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2289dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
22904e2b4712SSatish Balay {
22914e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22924e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
22936849ba73SBarry Smith   PetscErrorCode    ierr;
22945d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
22955d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2296d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2297d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2298d9fead3dSBarry Smith   const PetscScalar *b;
22994e2b4712SSatish Balay 
23004e2b4712SSatish Balay   PetscFunctionBegin;
2301d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2303f1af5d2fSBarry Smith   t  = a->solve_work;
23044e2b4712SSatish Balay 
23054e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23064e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
23074e2b4712SSatish Balay 
23084e2b4712SSatish Balay   /* forward solve the lower triangular */
23094e2b4712SSatish Balay   idx    = 5*(*r++);
2310f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2311f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
23124e2b4712SSatish Balay   for (i=1; i<n; i++) {
23134e2b4712SSatish Balay     v     = aa + 25*ai[i];
23144e2b4712SSatish Balay     vi    = aj + ai[i];
23154e2b4712SSatish Balay     nz    = diag[i] - ai[i];
23164e2b4712SSatish Balay     idx   = 5*(*r++);
2317f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2318f1af5d2fSBarry Smith     s5  = b[4+idx];
23194e2b4712SSatish Balay     while (nz--) {
23204e2b4712SSatish Balay       idx   = 5*(*vi++);
2321f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2322f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2323f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2324f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2325f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2326f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2327f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
23284e2b4712SSatish Balay       v += 25;
23294e2b4712SSatish Balay     }
23304e2b4712SSatish Balay     idx = 5*i;
2331f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2332f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
23334e2b4712SSatish Balay   }
23344e2b4712SSatish Balay   /* backward solve the upper triangular */
23354e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
23364e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
23374e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
23384e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
23394e2b4712SSatish Balay     idt  = 5*i;
2340f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2341f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
23424e2b4712SSatish Balay     while (nz--) {
23434e2b4712SSatish Balay       idx   = 5*(*vi++);
2344f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2345f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2346f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2347f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2348f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2349f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2350f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
23514e2b4712SSatish Balay       v += 25;
23524e2b4712SSatish Balay     }
23534e2b4712SSatish Balay     idc = 5*(*c--);
23544e2b4712SSatish Balay     v   = aa + 25*diag[i];
2355f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2356f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2357f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2358f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2359f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2360f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2361f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2362f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2363f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2364f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
23654e2b4712SSatish Balay   }
23664e2b4712SSatish Balay 
23674e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23684e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2369d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2371dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
23724e2b4712SSatish Balay   PetscFunctionReturn(0);
23734e2b4712SSatish Balay }
23744e2b4712SSatish Balay 
23754a2ae208SSatish Balay #undef __FUNCT__
23768f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
23778f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
23788f690400SShri Abhyankar {
23798f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
23808f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
23818f690400SShri Abhyankar   PetscErrorCode    ierr;
23828f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
238329b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
23848f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
23858f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
23868f690400SShri Abhyankar   const PetscScalar *b;
23878f690400SShri Abhyankar 
23888f690400SShri Abhyankar   PetscFunctionBegin;
23898f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23908f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23918f690400SShri Abhyankar   t  = a->solve_work;
23928f690400SShri Abhyankar 
23938f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
239429b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
23958f690400SShri Abhyankar 
23968f690400SShri Abhyankar   /* forward solve the lower triangular */
239729b92fc1SShri Abhyankar   idx    = 5*r[0];
23988f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
23998f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
24008f690400SShri Abhyankar   for (i=1; i<n; i++) {
24018f690400SShri Abhyankar     v     = aa + 25*ai[i];
24028f690400SShri Abhyankar     vi    = aj + ai[i];
24038f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
240429b92fc1SShri Abhyankar     idx   = 5*r[i];
24058f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
24068f690400SShri Abhyankar     s5  = b[4+idx];
240729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
240829b92fc1SShri Abhyankar       idx   = 5*vi[m];
24098f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
24108f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
24118f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
24128f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
24138f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
24148f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
24158f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
24168f690400SShri Abhyankar       v += 25;
24178f690400SShri Abhyankar     }
24188f690400SShri Abhyankar     idx = 5*i;
24198f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
24208f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
24218f690400SShri Abhyankar   }
24228f690400SShri Abhyankar   /* backward solve the upper triangular */
24238f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
24248f690400SShri Abhyankar     k    = 2*n-i;
24258f690400SShri Abhyankar     v    = aa + 25*ai[k];
24268f690400SShri Abhyankar     vi   = aj + ai[k];
24278f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
24288f690400SShri Abhyankar     idt  = 5*i;
24298f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
24308f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
243129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
243229b92fc1SShri Abhyankar       idx   = 5*vi[m];
24338f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
24348f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
24358f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
24368f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
24378f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
24388f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
24398f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
24408f690400SShri Abhyankar       v += 25;
24418f690400SShri Abhyankar     }
244229b92fc1SShri Abhyankar     idc = 5*c[i];
24438f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
24448f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
24458f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
24468f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
24478f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
24488f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
24498f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
24508f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
24518f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
24528f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
24538f690400SShri Abhyankar   }
24548f690400SShri Abhyankar 
24558f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
24568f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
24578f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24588f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24598f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
24608f690400SShri Abhyankar   PetscFunctionReturn(0);
24618f690400SShri Abhyankar }
246278bb4007SShri Abhyankar 
246378bb4007SShri Abhyankar #undef __FUNCT__
246478bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
246578bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
246678bb4007SShri Abhyankar {
246778bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
246878bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
246978bb4007SShri Abhyankar   PetscErrorCode    ierr;
247078bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
247178bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
247278bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
247378bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
247478bb4007SShri Abhyankar   const PetscScalar *b;
247578bb4007SShri Abhyankar 
247678bb4007SShri Abhyankar   PetscFunctionBegin;
247778bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
247878bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
247978bb4007SShri Abhyankar   t  = a->solve_work;
248078bb4007SShri Abhyankar 
248178bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
248278bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
248378bb4007SShri Abhyankar 
248478bb4007SShri Abhyankar   /* forward solve the lower triangular */
248578bb4007SShri Abhyankar   idx    = 5*r[0];
248678bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
248778bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
248878bb4007SShri Abhyankar   for (i=1; i<n; i++) {
248978bb4007SShri Abhyankar     v     = aa + 25*ai[i];
249078bb4007SShri Abhyankar     vi    = aj + ai[i];
249178bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
249278bb4007SShri Abhyankar     idx   = 5*r[i];
249378bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
249478bb4007SShri Abhyankar     s5  = b[4+idx];
249578bb4007SShri Abhyankar     for(m=0;m<nz;m++){
249678bb4007SShri Abhyankar       idx   = 5*vi[m];
249778bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
249878bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
249978bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
250078bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
250178bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
250278bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
250378bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
250478bb4007SShri Abhyankar       v += 25;
250578bb4007SShri Abhyankar     }
250678bb4007SShri Abhyankar     idx = 5*i;
250778bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
250878bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
250978bb4007SShri Abhyankar   }
251078bb4007SShri Abhyankar   /* backward solve the upper triangular */
251178bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
251278bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
251378bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
251478bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
251578bb4007SShri Abhyankar     idt  = 5*i;
251678bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
251778bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
251878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
251978bb4007SShri Abhyankar       idx   = 5*vi[m];
252078bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
252178bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
252278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
252378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
252478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
252578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
252678bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
252778bb4007SShri Abhyankar       v += 25;
252878bb4007SShri Abhyankar     }
252978bb4007SShri Abhyankar     idc = 5*c[i];
253078bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
253178bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
253278bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
253378bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
253478bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
253578bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
253678bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
253778bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
253878bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
253978bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
254078bb4007SShri Abhyankar   }
254178bb4007SShri Abhyankar 
254278bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
254378bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
254478bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
254578bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
254678bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
254778bb4007SShri Abhyankar   PetscFunctionReturn(0);
254878bb4007SShri Abhyankar }
254978bb4007SShri Abhyankar 
25508f690400SShri Abhyankar #undef __FUNCT__
25514a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2552dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
255315091d37SBarry Smith {
255415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2555690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2556dfbe8321SBarry Smith   PetscErrorCode    ierr;
2557690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2558d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2559d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2560d9fead3dSBarry Smith   const PetscScalar *b;
256115091d37SBarry Smith 
256215091d37SBarry Smith   PetscFunctionBegin;
2563d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256515091d37SBarry Smith   /* forward solve the lower triangular */
256615091d37SBarry Smith   idx    = 0;
256715091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
256815091d37SBarry Smith   for (i=1; i<n; i++) {
256915091d37SBarry Smith     v     =  aa + 25*ai[i];
257015091d37SBarry Smith     vi    =  aj + ai[i];
257115091d37SBarry Smith     nz    =  diag[i] - ai[i];
257215091d37SBarry Smith     idx   =  5*i;
2573f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
257415091d37SBarry Smith     while (nz--) {
257515091d37SBarry Smith       jdx   = 5*(*vi++);
257615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2577f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2578f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2579f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2580f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2581f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
258215091d37SBarry Smith       v    += 25;
258315091d37SBarry Smith     }
2584f1af5d2fSBarry Smith     x[idx]   = s1;
2585f1af5d2fSBarry Smith     x[1+idx] = s2;
2586f1af5d2fSBarry Smith     x[2+idx] = s3;
2587f1af5d2fSBarry Smith     x[3+idx] = s4;
2588f1af5d2fSBarry Smith     x[4+idx] = s5;
258915091d37SBarry Smith   }
259015091d37SBarry Smith   /* backward solve the upper triangular */
259115091d37SBarry Smith   for (i=n-1; i>=0; i--){
259215091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
259315091d37SBarry Smith     vi   = aj + diag[i] + 1;
259415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
259515091d37SBarry Smith     idt  = 5*i;
2596f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2597f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
259815091d37SBarry Smith     while (nz--) {
259915091d37SBarry Smith       idx   = 5*(*vi++);
260015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2601f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2602f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2603f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2604f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2605f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
260615091d37SBarry Smith       v    += 25;
260715091d37SBarry Smith     }
260815091d37SBarry Smith     v        = aa + 25*diag[i];
2609f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2610f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2611f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2612f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2613f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
261415091d37SBarry Smith   }
261515091d37SBarry Smith 
2616d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2618dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
261915091d37SBarry Smith   PetscFunctionReturn(0);
262015091d37SBarry Smith }
262115091d37SBarry Smith 
26224a2ae208SSatish Balay #undef __FUNCT__
2623cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2624cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2625cee9d6f2SShri Abhyankar {
2626cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
26276464896eSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2628cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
2629cee9d6f2SShri Abhyankar   PetscInt          jdx;
2630cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2631cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2632cee9d6f2SShri Abhyankar   const PetscScalar *b;
2633cee9d6f2SShri Abhyankar 
2634cee9d6f2SShri Abhyankar   PetscFunctionBegin;
2635cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2636cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2637cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
2638cee9d6f2SShri Abhyankar   idx    = 0;
2639cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2640cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
2641cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
2642cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
2643cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
2644cee9d6f2SShri Abhyankar     idx = 5*i;
2645cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
26466464896eSShri Abhyankar     for(k=0;k<nz;k++) {
26476464896eSShri Abhyankar       jdx   = 5*vi[k];
2648cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2649cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2650cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2651cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2652cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2653cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2654cee9d6f2SShri Abhyankar       v    += 25;
2655cee9d6f2SShri Abhyankar     }
2656cee9d6f2SShri Abhyankar     x[idx]   = s1;
2657cee9d6f2SShri Abhyankar     x[1+idx] = s2;
2658cee9d6f2SShri Abhyankar     x[2+idx] = s3;
2659cee9d6f2SShri Abhyankar     x[3+idx] = s4;
2660cee9d6f2SShri Abhyankar     x[4+idx] = s5;
2661cee9d6f2SShri Abhyankar   }
2662cee9d6f2SShri Abhyankar 
2663cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
2664cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2665cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
2666cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
2667cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2668cee9d6f2SShri Abhyankar     idt = 5*i;
2669cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
2670cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
26716464896eSShri Abhyankar     for(k=0;k<nz;k++){
26726464896eSShri Abhyankar       idx   = 5*vi[k];
2673cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2674cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2675cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2676cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2677cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2678cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2679cee9d6f2SShri Abhyankar       v    += 25;
2680cee9d6f2SShri Abhyankar     }
2681cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2682cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2683cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2684cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2685cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2686cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2687cee9d6f2SShri Abhyankar   }
2688cee9d6f2SShri Abhyankar 
2689cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2690cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2691cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2692cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2693cee9d6f2SShri Abhyankar }
2694cee9d6f2SShri Abhyankar 
2695cee9d6f2SShri Abhyankar #undef __FUNCT__
269653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
269753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
269853cca76cSShri Abhyankar {
269953cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
270053cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
270153cca76cSShri Abhyankar   PetscErrorCode    ierr;
270253cca76cSShri Abhyankar   PetscInt          jdx;
270353cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
270453cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
270553cca76cSShri Abhyankar   const PetscScalar *b;
270653cca76cSShri Abhyankar 
270753cca76cSShri Abhyankar   PetscFunctionBegin;
270853cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
270953cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
271053cca76cSShri Abhyankar   /* forward solve the lower triangular */
271153cca76cSShri Abhyankar   idx    = 0;
271253cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
271353cca76cSShri Abhyankar   for (i=1; i<n; i++) {
271453cca76cSShri Abhyankar     v   = aa + 25*ai[i];
271553cca76cSShri Abhyankar     vi  = aj + ai[i];
271653cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
271753cca76cSShri Abhyankar     idx = 5*i;
271853cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
271953cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
272053cca76cSShri Abhyankar       jdx   = 5*vi[k];
272153cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
272253cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
272353cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
272453cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
272553cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
272653cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
272753cca76cSShri Abhyankar       v    += 25;
272853cca76cSShri Abhyankar     }
272953cca76cSShri Abhyankar     x[idx]   = s1;
273053cca76cSShri Abhyankar     x[1+idx] = s2;
273153cca76cSShri Abhyankar     x[2+idx] = s3;
273253cca76cSShri Abhyankar     x[3+idx] = s4;
273353cca76cSShri Abhyankar     x[4+idx] = s5;
273453cca76cSShri Abhyankar   }
273553cca76cSShri Abhyankar 
273653cca76cSShri Abhyankar   /* backward solve the upper triangular */
273753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
273853cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
273953cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
274053cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
274153cca76cSShri Abhyankar     idt = 5*i;
274253cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
274353cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
274453cca76cSShri Abhyankar     for(k=0;k<nz;k++){
274553cca76cSShri Abhyankar       idx   = 5*vi[k];
274653cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
274753cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
274853cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
274953cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
275053cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
275153cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
275253cca76cSShri Abhyankar       v    += 25;
275353cca76cSShri Abhyankar     }
275453cca76cSShri Abhyankar     /* x = inv_diagonal*x */
275553cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
275653cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
275753cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
275853cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
275953cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
276053cca76cSShri Abhyankar   }
276153cca76cSShri Abhyankar 
276253cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
276353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
276453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
276553cca76cSShri Abhyankar   PetscFunctionReturn(0);
276653cca76cSShri Abhyankar }
276753cca76cSShri Abhyankar 
276853cca76cSShri Abhyankar #undef __FUNCT__
27694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2770dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
27714e2b4712SSatish Balay {
27724e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
27734e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
27746849ba73SBarry Smith   PetscErrorCode    ierr;
27755d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
27765d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2777d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2778d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2779d9fead3dSBarry Smith   const PetscScalar *b;
27804e2b4712SSatish Balay 
27814e2b4712SSatish Balay   PetscFunctionBegin;
2782d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2784f1af5d2fSBarry Smith   t  = a->solve_work;
27854e2b4712SSatish Balay 
27864e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27874e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27884e2b4712SSatish Balay 
27894e2b4712SSatish Balay   /* forward solve the lower triangular */
27904e2b4712SSatish Balay   idx    = 4*(*r++);
2791f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2792f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
27934e2b4712SSatish Balay   for (i=1; i<n; i++) {
27944e2b4712SSatish Balay     v     = aa + 16*ai[i];
27954e2b4712SSatish Balay     vi    = aj + ai[i];
27964e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27974e2b4712SSatish Balay     idx   = 4*(*r++);
2798f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
27994e2b4712SSatish Balay     while (nz--) {
28004e2b4712SSatish Balay       idx   = 4*(*vi++);
2801f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2802f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2803f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2804f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2805f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
28064e2b4712SSatish Balay       v    += 16;
28074e2b4712SSatish Balay     }
28084e2b4712SSatish Balay     idx        = 4*i;
2809f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2810f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
28114e2b4712SSatish Balay   }
28124e2b4712SSatish Balay   /* backward solve the upper triangular */
28134e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28144e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
28154e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28164e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28174e2b4712SSatish Balay     idt  = 4*i;
2818f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2819f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
28204e2b4712SSatish Balay     while (nz--) {
28214e2b4712SSatish Balay       idx   = 4*(*vi++);
2822f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2823f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2824f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2825f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2826f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2827f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
28284e2b4712SSatish Balay       v += 16;
28294e2b4712SSatish Balay     }
28304e2b4712SSatish Balay     idc      = 4*(*c--);
28314e2b4712SSatish Balay     v        = aa + 16*diag[i];
2832f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2833f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2834f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2835f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
28364e2b4712SSatish Balay   }
28374e2b4712SSatish Balay 
28384e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28394e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2840d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28411ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2842dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28434e2b4712SSatish Balay   PetscFunctionReturn(0);
28444e2b4712SSatish Balay }
2845f26ec98cSKris Buschelman 
2846f26ec98cSKris Buschelman #undef __FUNCT__
28478f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
28488f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
28498f690400SShri Abhyankar {
28508f690400SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
28518f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
28528f690400SShri Abhyankar   PetscErrorCode    ierr;
285329b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
28548f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
28558f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
28568f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
28578f690400SShri Abhyankar   const PetscScalar *b;
28588f690400SShri Abhyankar 
28598f690400SShri Abhyankar   PetscFunctionBegin;
28608f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28618f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28628f690400SShri Abhyankar   t  = a->solve_work;
28638f690400SShri Abhyankar 
28648f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
286529b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
28668f690400SShri Abhyankar 
28678f690400SShri Abhyankar   /* forward solve the lower triangular */
286829b92fc1SShri Abhyankar   idx    = 4*r[0];
28698f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
28708f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
28718f690400SShri Abhyankar   for (i=1; i<n; i++) {
28728f690400SShri Abhyankar     v     = aa + 16*ai[i];
28738f690400SShri Abhyankar     vi    = aj + ai[i];
28748f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
287529b92fc1SShri Abhyankar     idx   = 4*r[i];
28768f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
287729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
287829b92fc1SShri Abhyankar       idx   = 4*vi[m];
28798f690400SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
28808f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
28818f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
28828f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
28838f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
28848f690400SShri Abhyankar       v    += 16;
28858f690400SShri Abhyankar     }
28868f690400SShri Abhyankar     idx        = 4*i;
28878f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
28888f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
28898f690400SShri Abhyankar   }
28908f690400SShri Abhyankar   /* backward solve the upper triangular */
28918f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
28928f690400SShri Abhyankar     k    = 2*n-i;
28938f690400SShri Abhyankar     v    = aa + 16*ai[k];
28948f690400SShri Abhyankar     vi   = aj + ai[k];
28958f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
28968f690400SShri Abhyankar     idt  = 4*i;
28978f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
28988f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
289929b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
290029b92fc1SShri Abhyankar       idx   = 4*vi[m];
29018f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
29028f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
29038f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
29048f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
29058f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
29068f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
29078f690400SShri Abhyankar       v += 16;
29088f690400SShri Abhyankar     }
290929b92fc1SShri Abhyankar     idc      = 4*c[i];
29108f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
29118f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
29128f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
29138f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
29148f690400SShri Abhyankar   }
29158f690400SShri Abhyankar 
29168f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29178f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
29188f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29198f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
29208f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
29218f690400SShri Abhyankar   PetscFunctionReturn(0);
29228f690400SShri Abhyankar }
29238f690400SShri Abhyankar 
29248f690400SShri Abhyankar #undef __FUNCT__
292578bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
292678bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
292778bb4007SShri Abhyankar {
292878bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
292978bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
293078bb4007SShri Abhyankar   PetscErrorCode    ierr;
293178bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
293278bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
293378bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
293478bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
293578bb4007SShri Abhyankar   const PetscScalar *b;
293678bb4007SShri Abhyankar 
293778bb4007SShri Abhyankar   PetscFunctionBegin;
293878bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
293978bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
294078bb4007SShri Abhyankar   t  = a->solve_work;
294178bb4007SShri Abhyankar 
294278bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
294378bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
294478bb4007SShri Abhyankar 
294578bb4007SShri Abhyankar   /* forward solve the lower triangular */
294678bb4007SShri Abhyankar   idx    = 4*r[0];
294778bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
294878bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
294978bb4007SShri Abhyankar   for (i=1; i<n; i++) {
295078bb4007SShri Abhyankar     v     = aa + 16*ai[i];
295178bb4007SShri Abhyankar     vi    = aj + ai[i];
295278bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
295378bb4007SShri Abhyankar     idx   = 4*r[i];
295478bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
295578bb4007SShri Abhyankar     for(m=0;m<nz;m++){
295678bb4007SShri Abhyankar       idx   = 4*vi[m];
295778bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
295878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
295978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
296078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
296178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
296278bb4007SShri Abhyankar       v    += 16;
296378bb4007SShri Abhyankar     }
296478bb4007SShri Abhyankar     idx        = 4*i;
296578bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
296678bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
296778bb4007SShri Abhyankar   }
296878bb4007SShri Abhyankar   /* backward solve the upper triangular */
296978bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
297078bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
297178bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
297278bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
297378bb4007SShri Abhyankar     idt  = 4*i;
297478bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
297578bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
297678bb4007SShri Abhyankar     for(m=0;m<nz;m++){
297778bb4007SShri Abhyankar       idx   = 4*vi[m];
297878bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
297978bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
298078bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
298178bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
298278bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
298378bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
298478bb4007SShri Abhyankar       v += 16;
298578bb4007SShri Abhyankar     }
298678bb4007SShri Abhyankar     idc      = 4*c[i];
298778bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
298878bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
298978bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
299078bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
299178bb4007SShri Abhyankar   }
299278bb4007SShri Abhyankar 
299378bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
299478bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
299578bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
299678bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
299778bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
299878bb4007SShri Abhyankar   PetscFunctionReturn(0);
299978bb4007SShri Abhyankar }
300078bb4007SShri Abhyankar 
300178bb4007SShri Abhyankar #undef __FUNCT__
3002f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3003dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3004f26ec98cSKris Buschelman {
3005f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3006f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
30076849ba73SBarry Smith   PetscErrorCode    ierr;
30085d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
30095d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3010d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3011d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3012d9fead3dSBarry Smith   PetscScalar       *x;
3013d9fead3dSBarry Smith   const PetscScalar *b;
3014f26ec98cSKris Buschelman 
3015f26ec98cSKris Buschelman   PetscFunctionBegin;
3016d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3018f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3019f26ec98cSKris Buschelman 
3020f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3021f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3022f26ec98cSKris Buschelman 
3023f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3024f26ec98cSKris Buschelman   idx    = 4*(*r++);
3025f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3026f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3027f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3028f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3029f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3030f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3031f26ec98cSKris Buschelman     vi    = aj + ai[i];
3032f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3033f26ec98cSKris Buschelman     idx   = 4*(*r++);
3034f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3035f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3036f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3037f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3038f26ec98cSKris Buschelman     while (nz--) {
3039f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3040f26ec98cSKris Buschelman       x1  = t[idx];
3041f26ec98cSKris Buschelman       x2  = t[1+idx];
3042f26ec98cSKris Buschelman       x3  = t[2+idx];
3043f26ec98cSKris Buschelman       x4  = t[3+idx];
3044f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3045f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3046f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3047f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3048f26ec98cSKris Buschelman       v    += 16;
3049f26ec98cSKris Buschelman     }
3050f26ec98cSKris Buschelman     idx        = 4*i;
3051f26ec98cSKris Buschelman     t[idx]   = s1;
3052f26ec98cSKris Buschelman     t[1+idx] = s2;
3053f26ec98cSKris Buschelman     t[2+idx] = s3;
3054f26ec98cSKris Buschelman     t[3+idx] = s4;
3055f26ec98cSKris Buschelman   }
3056f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3057f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3058f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3059f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3060f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3061f26ec98cSKris Buschelman     idt  = 4*i;
3062f26ec98cSKris Buschelman     s1 = t[idt];
3063f26ec98cSKris Buschelman     s2 = t[1+idt];
3064f26ec98cSKris Buschelman     s3 = t[2+idt];
3065f26ec98cSKris Buschelman     s4 = t[3+idt];
3066f26ec98cSKris Buschelman     while (nz--) {
3067f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3068f26ec98cSKris Buschelman       x1  = t[idx];
3069f26ec98cSKris Buschelman       x2  = t[1+idx];
3070f26ec98cSKris Buschelman       x3  = t[2+idx];
3071f26ec98cSKris Buschelman       x4  = t[3+idx];
3072f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3073f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3074f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3075f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3076f26ec98cSKris Buschelman       v += 16;
3077f26ec98cSKris Buschelman     }
3078f26ec98cSKris Buschelman     idc      = 4*(*c--);
3079f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3080f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3081f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3082f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3083f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3084f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3085f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3086f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3087f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3088f26ec98cSKris Buschelman  }
3089f26ec98cSKris Buschelman 
3090f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3091f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3092d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3094dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3095f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3096f26ec98cSKris Buschelman }
3097f26ec98cSKris Buschelman 
#if defined (PETSC_HAVE_SSE)

#include PETSC_HAVE_SSE

#undef __FUNCT__
#define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - SSE version of the permuted block-size-4
   triangular solves.  Uses the row (a->row) and column (a->col) index sets
   held by the matrix and a->solve_work as scratch.

   Input:
     A  - matrix whose i, j, diag, a and solve_work members are used
     bb - right-hand side
   Output:
     xx - solution

   The forward solve accumulates into the work array t (indexed by block
   row); the backward solve overwrites t with the solution and streams each
   finished block into xx at the position given by the column permutation.
*/
PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
{
  /*
     Note: This code uses demotion of double
     to float when performing the mixed-mode computation.
     This may not be numerically reasonable for all applications.
  */
  Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  IS             iscol=a->col,isrow=a->row;
  PetscErrorCode ierr;
  PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
  const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
  MatScalar      *aa=a->a,*v;
  PetscScalar    *x,*b,*t;

  /* Make space in temp stack for 16 Byte Aligned arrays */
  float           ssealignedspace[11],*tmps,*tmpx;
  unsigned long   offset;

  PetscFunctionBegin;
  SSE_SCOPE_BEGIN;

    /* Pick the first 16-byte aligned float inside ssealignedspace; tmps and
       tmpx are two consecutive 4-float slots starting there */
    offset = (unsigned long)ssealignedspace % 16;
    if (offset) offset = (16 - offset)/4;
    tmps = &ssealignedspace[offset];
    tmpx = &ssealignedspace[offset+4];
    PREFETCH_NTA(aa+16*ai[1]);

    ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
    ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
    t  = a->solve_work;

    ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
    ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);

    /* forward solve the lower triangular */
    idx  = 4*(*r++);
    t[0] = b[idx];   t[1] = b[1+idx];
    t[2] = b[2+idx]; t[3] = b[3+idx];
    v    =  aa + 16*ai[1];

    for (i=1; i<n;) {
      PREFETCH_NTA(&v[8]);
      vi   =  aj      + ai[i];
      nz   =  diag[i] - ai[i];
      idx  =  4*(*r++);

      /* Demote sum from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float.
           The partial forward solution lives in the work array t (it is
           what this loop stores into below); x has not been written yet. */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2

        v  += 16;
      }
      idx = 4*i;
      v   = aa + 16*ai[++i];
      PREFETCH_NTA(v);
      STORE_PS(tmps,XMM7);

      /* Promote result from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
    }
    /* backward solve the upper triangular */
    idt  = 4*(n-1);
    ai16 = 16*diag[n-1];
    v    = aa + ai16 + 16;
    for (i=n-1; i>=0;){
      PREFETCH_NTA(&v[8]);
      vi = aj + diag[i] + 1;
      nz = ai[i+1] - diag[i] - 1;

      /* Demote accumulator from double to float */
      CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
      LOAD_PS(tmps,XMM7);

      while (nz--) {
        PREFETCH_NTA(&v[16]);
        idx = 4*(*vi++);

        /* Demote solution (so far) from double to float */
        CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);

        /* 4x4 Matrix-Vector Product with negative accumulation: */
        SSE_INLINE_BEGIN_2(tmpx,v)
          SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)

          /* First Column */
          SSE_COPY_PS(XMM0,XMM6)
          SSE_SHUFFLE(XMM0,XMM0,0x00)
          SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
          SSE_SUB_PS(XMM7,XMM0)

          /* Second Column */
          SSE_COPY_PS(XMM1,XMM6)
          SSE_SHUFFLE(XMM1,XMM1,0x55)
          SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
          SSE_SUB_PS(XMM7,XMM1)

          SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)

          /* Third Column */
          SSE_COPY_PS(XMM2,XMM6)
          SSE_SHUFFLE(XMM2,XMM2,0xAA)
          SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
          SSE_SUB_PS(XMM7,XMM2)

          /* Fourth Column */
          SSE_COPY_PS(XMM3,XMM6)
          SSE_SHUFFLE(XMM3,XMM3,0xFF)
          SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
          SSE_SUB_PS(XMM7,XMM3)
        SSE_INLINE_END_2
        v  += 16;
      }
      v    = aa + ai16;
      /* Step to the previous block row.  Only read diag[] while the row
         index is still valid; after row 0 ai16 keeps its old value so the
         prefetches below stay on this row's data instead of using diag[-1]. */
      if (--i >= 0) ai16 = 16*diag[i];
      PREFETCH_NTA(aa+ai16+16);
      /*
         Scale the result by the diagonal 4x4 block,
         which was inverted as part of the factorization
      */
      SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
        /* First Column */
        SSE_COPY_PS(XMM0,XMM7)
        SSE_SHUFFLE(XMM0,XMM0,0x00)
        SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)

        /* Second Column */
        SSE_COPY_PS(XMM1,XMM7)
        SSE_SHUFFLE(XMM1,XMM1,0x55)
        SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
        SSE_ADD_PS(XMM0,XMM1)

        SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)

        /* Third Column */
        SSE_COPY_PS(XMM2,XMM7)
        SSE_SHUFFLE(XMM2,XMM2,0xAA)
        SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
        SSE_ADD_PS(XMM0,XMM2)

        /* Fourth Column */
        SSE_COPY_PS(XMM3,XMM7)
        SSE_SHUFFLE(XMM3,XMM3,0xFF)
        SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
        SSE_ADD_PS(XMM0,XMM3)

        SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
      SSE_INLINE_END_3

      /* Promote solution from float to double */
      CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);

      /* Apply reordering to t and stream into x.    */
      /* This way, x doesn't pollute the cache.      */
      /* Be careful with size: 2 doubles = 4 floats! */
      idc  = 4*(*c--);
      SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
        /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
        /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
        SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
        SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
      SSE_INLINE_END_2
      v    = aa + ai16 + 16;
      idt -= 4;
    }

    ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
    ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
    ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
    ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
    ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
  SSE_SCOPE_END;
  PetscFunctionReturn(0);
}

#endif
33200ef38995SBarry Smith 
33210ef38995SBarry Smith 
33224e2b4712SSatish Balay /*
33234e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
33244e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
33254e2b4712SSatish Balay */
33264a2ae208SSatish Balay #undef __FUNCT__
33274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3328dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
33294e2b4712SSatish Balay {
33304e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3331356650c2SBarry Smith   PetscInt          n=a->mbs;
3332356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3333dfbe8321SBarry Smith   PetscErrorCode    ierr;
3334356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3335d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3336d9fead3dSBarry Smith   PetscScalar       *x;
3337d9fead3dSBarry Smith   const PetscScalar *b;
33384e2b4712SSatish Balay 
33394e2b4712SSatish Balay   PetscFunctionBegin;
3340d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
33424e2b4712SSatish Balay 
3343aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
33442853dc0eSBarry Smith   {
334587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
33462853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
33472853dc0eSBarry Smith   }
3348aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
33492853dc0eSBarry Smith   {
335087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
33512853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
33522853dc0eSBarry Smith   }
3353aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
33542853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3355e1293385SBarry Smith #else
335630d4dcafSBarry Smith   {
335787828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3358d9fead3dSBarry Smith     const MatScalar *v;
3359356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3360356650c2SBarry Smith     const PetscInt  *vi;
3361e1293385SBarry Smith 
33624e2b4712SSatish Balay   /* forward solve the lower triangular */
33634e2b4712SSatish Balay   idx    = 0;
3364e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
33654e2b4712SSatish Balay   for (i=1; i<n; i++) {
33664e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
33674e2b4712SSatish Balay     vi    =  aj      + ai[i];
33684e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3369e1293385SBarry Smith     idx   +=  4;
3370f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
33714e2b4712SSatish Balay     while (nz--) {
33724e2b4712SSatish Balay       jdx   = 4*(*vi++);
33734e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3374f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3375f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3376f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3377f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
33784e2b4712SSatish Balay       v    += 16;
33794e2b4712SSatish Balay     }
3380f1af5d2fSBarry Smith     x[idx]   = s1;
3381f1af5d2fSBarry Smith     x[1+idx] = s2;
3382f1af5d2fSBarry Smith     x[2+idx] = s3;
3383f1af5d2fSBarry Smith     x[3+idx] = s4;
33844e2b4712SSatish Balay   }
33854e2b4712SSatish Balay   /* backward solve the upper triangular */
33864e555682SBarry Smith   idt = 4*(n-1);
33874e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33884e555682SBarry Smith     ai16 = 16*diag[i];
33894e555682SBarry Smith     v    = aa + ai16 + 16;
33904e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33914e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3392f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3393f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
33944e2b4712SSatish Balay     while (nz--) {
33954e2b4712SSatish Balay       idx   = 4*(*vi++);
33964e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3397f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3398f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3399f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3400f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
34014e2b4712SSatish Balay       v    += 16;
34024e2b4712SSatish Balay     }
34034e555682SBarry Smith     v        = aa + ai16;
3404f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3405f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3406f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3407f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3408329f5518SBarry Smith     idt -= 4;
34094e2b4712SSatish Balay   }
341030d4dcafSBarry Smith   }
3411e1293385SBarry Smith #endif
34124e2b4712SSatish Balay 
3413d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3415dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
34164e2b4712SSatish Balay   PetscFunctionReturn(0);
34174e2b4712SSatish Balay }
34184e2b4712SSatish Balay 
3419f26ec98cSKris Buschelman #undef __FUNCT__
3420cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3421cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3422cee9d6f2SShri Abhyankar {
3423cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
34246464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3425cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3426cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3427cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3428cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3429cee9d6f2SShri Abhyankar     PetscScalar       *x;
3430cee9d6f2SShri Abhyankar     const PetscScalar *b;
3431cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3432cee9d6f2SShri Abhyankar 
3433cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3434cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3435cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3436cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3437cee9d6f2SShri Abhyankar     idx    = 0;
3438cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3439cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3440cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3441cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3442cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3443cee9d6f2SShri Abhyankar       idx   = bs*i;
3444cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
34456464896eSShri Abhyankar       for(k=0;k<nz;k++) {
34466464896eSShri Abhyankar           jdx   = bs*vi[k];
3447cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3448cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3449cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3450cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3451cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3452cee9d6f2SShri Abhyankar 
3453cee9d6f2SShri Abhyankar           v   +=  bs2;
3454cee9d6f2SShri Abhyankar         }
3455cee9d6f2SShri Abhyankar 
3456cee9d6f2SShri Abhyankar        x[idx]   = s1;
3457cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3458cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3459cee9d6f2SShri Abhyankar        x[3+idx] = s4;
3460cee9d6f2SShri Abhyankar     }
3461cee9d6f2SShri Abhyankar 
3462cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3463cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3464cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3465cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3466cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3467cee9d6f2SShri Abhyankar      idt = bs*i;
3468cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3469cee9d6f2SShri Abhyankar 
34706464896eSShri Abhyankar     for(k=0;k<nz;k++){
34716464896eSShri Abhyankar       idx   = bs*vi[k];
3472cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3473cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3474cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3475cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3476cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3477cee9d6f2SShri Abhyankar 
3478cee9d6f2SShri Abhyankar         v   +=  bs2;
3479cee9d6f2SShri Abhyankar     }
3480cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3481cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3482cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3483cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3484cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3485cee9d6f2SShri Abhyankar 
3486cee9d6f2SShri Abhyankar   }
3487cee9d6f2SShri Abhyankar 
3488cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3489cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3490cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3491cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3492cee9d6f2SShri Abhyankar }
3493cee9d6f2SShri Abhyankar 
3494b2b2dd24SShri Abhyankar #undef __FUNCT__
3495b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3496b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3497b2b2dd24SShri Abhyankar {
3498b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3499b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3500b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3501b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3502b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3503b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3504b2b2dd24SShri Abhyankar     PetscScalar       *x;
3505b2b2dd24SShri Abhyankar     const PetscScalar *b;
3506b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3507cee9d6f2SShri Abhyankar 
3508b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3509b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3510b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3511b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3512b2b2dd24SShri Abhyankar     idx    = 0;
3513b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3514b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3515b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3516b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3517b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3518b2b2dd24SShri Abhyankar       idx   = bs*i;
3519b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3520b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3521b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3522b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3523b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3524b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3525b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3526b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3527b2b2dd24SShri Abhyankar 
3528b2b2dd24SShri Abhyankar           v   +=  bs2;
3529b2b2dd24SShri Abhyankar         }
3530b2b2dd24SShri Abhyankar 
3531b2b2dd24SShri Abhyankar        x[idx]   = s1;
3532b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3533b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3534b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3535b2b2dd24SShri Abhyankar     }
3536b2b2dd24SShri Abhyankar 
3537b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3538b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3539b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3540b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3541b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3542b2b2dd24SShri Abhyankar      idt = bs*i;
3543b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3544b2b2dd24SShri Abhyankar 
3545b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3546b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3547b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3548b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3549b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3550b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3551b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3552b2b2dd24SShri Abhyankar 
3553b2b2dd24SShri Abhyankar         v   +=  bs2;
3554b2b2dd24SShri Abhyankar     }
3555b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3556b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3557b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3558b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3559b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3560b2b2dd24SShri Abhyankar 
3561b2b2dd24SShri Abhyankar   }
3562b2b2dd24SShri Abhyankar 
3563b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3564b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3565b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3566b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3567b2b2dd24SShri Abhyankar }
3568cee9d6f2SShri Abhyankar 
3569cee9d6f2SShri Abhyankar #undef __FUNCT__
3570f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3571dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3572f26ec98cSKris Buschelman {
3573f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3574690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3575dfbe8321SBarry Smith   PetscErrorCode ierr;
3576690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3577f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3578f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3579f26ec98cSKris Buschelman 
3580f26ec98cSKris Buschelman   PetscFunctionBegin;
35811ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
35821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3583f26ec98cSKris Buschelman 
3584f26ec98cSKris Buschelman   {
3585f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3586f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3587690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3588f26ec98cSKris Buschelman 
3589f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3590f26ec98cSKris Buschelman     idx  = 0;
3591f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3592f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3593f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3594f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3595f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3596f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3597f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3598f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3599f26ec98cSKris Buschelman       idx   +=  4;
3600f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3601f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3602f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3603f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3604f26ec98cSKris Buschelman       while (nz--) {
3605f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3606f26ec98cSKris Buschelman         x1  = t[jdx];
3607f26ec98cSKris Buschelman         x2  = t[1+jdx];
3608f26ec98cSKris Buschelman         x3  = t[2+jdx];
3609f26ec98cSKris Buschelman         x4  = t[3+jdx];
3610f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3611f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3612f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3613f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3614f26ec98cSKris Buschelman         v    += 16;
3615f26ec98cSKris Buschelman       }
3616f26ec98cSKris Buschelman       t[idx]   = s1;
3617f26ec98cSKris Buschelman       t[1+idx] = s2;
3618f26ec98cSKris Buschelman       t[2+idx] = s3;
3619f26ec98cSKris Buschelman       t[3+idx] = s4;
3620f26ec98cSKris Buschelman     }
3621f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3622f26ec98cSKris Buschelman     idt = 4*(n-1);
3623f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3624f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3625f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3626f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3627f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3628f26ec98cSKris Buschelman       s1   = t[idt];
3629f26ec98cSKris Buschelman       s2   = t[1+idt];
3630f26ec98cSKris Buschelman       s3   = t[2+idt];
3631f26ec98cSKris Buschelman       s4   = t[3+idt];
3632f26ec98cSKris Buschelman       while (nz--) {
3633f26ec98cSKris Buschelman         idx = 4*(*vi++);
3634f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3635f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3636f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3637f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3638f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3639f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3640f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3641f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3642f26ec98cSKris Buschelman         v    += 16;
3643f26ec98cSKris Buschelman       }
3644f26ec98cSKris Buschelman       v        = aa + ai16;
3645f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3646f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3647f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3648f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3649f26ec98cSKris Buschelman       idt -= 4;
3650f26ec98cSKris Buschelman     }
3651f26ec98cSKris Buschelman   }
3652f26ec98cSKris Buschelman 
36531ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
36541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3655dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3656f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3657f26ec98cSKris Buschelman }
3658f26ec98cSKris Buschelman 
36593660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
36603660e330SKris Buschelman 
36613660e330SKris Buschelman #include PETSC_HAVE_SSE
36623660e330SKris Buschelman #undef __FUNCT__
36637cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3664dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
36653660e330SKris Buschelman {
36663660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
36672aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3668dfbe8321SBarry Smith   PetscErrorCode ierr;
3669dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
36703660e330SKris Buschelman   MatScalar      *aa=a->a;
367187828ca2SBarry Smith   PetscScalar    *x,*b;
36723660e330SKris Buschelman 
36733660e330SKris Buschelman   PetscFunctionBegin;
36743660e330SKris Buschelman   SSE_SCOPE_BEGIN;
36753660e330SKris Buschelman   /*
36763660e330SKris Buschelman      Note: This code currently uses demotion of double
36773660e330SKris Buschelman      to float when performing the mixed-mode computation.
36783660e330SKris Buschelman      This may not be numerically reasonable for all applications.
36793660e330SKris Buschelman   */
36803660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
36813660e330SKris Buschelman 
36821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
36831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
36843660e330SKris Buschelman   {
3685eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3686eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
36872aa5897fSKris Buschelman     int            nz,i,idt,ai16;
36882aa5897fSKris Buschelman     unsigned int   jdx,idx;
36892aa5897fSKris Buschelman     unsigned short *vi;
3690eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
36913660e330SKris Buschelman 
3692eb05f457SKris Buschelman     /* First block is the identity. */
36933660e330SKris Buschelman     idx  = 0;
3694eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
36952aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
36963660e330SKris Buschelman 
36973660e330SKris Buschelman     for (i=1; i<n;) {
36983660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
36993660e330SKris Buschelman       vi   =  aj      + ai[i];
37003660e330SKris Buschelman       nz   =  diag[i] - ai[i];
37013660e330SKris Buschelman       idx +=  4;
37023660e330SKris Buschelman 
3703eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3704eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3705eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
37063660e330SKris Buschelman 
37073660e330SKris Buschelman       while (nz--) {
37083660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
37092aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
37103660e330SKris Buschelman 
37113660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3712eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
37133660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
37143660e330SKris Buschelman 
37153660e330SKris Buschelman           /* First Column */
37163660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
37173660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
37183660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
37193660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
37203660e330SKris Buschelman 
37213660e330SKris Buschelman           /* Second Column */
37223660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
37233660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
37243660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
37253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
37263660e330SKris Buschelman 
37273660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
37283660e330SKris Buschelman 
37293660e330SKris Buschelman           /* Third Column */
37303660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
37313660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
37323660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
37333660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
37343660e330SKris Buschelman 
37353660e330SKris Buschelman           /* Fourth Column */
37363660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
37373660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
37383660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
37393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
37403660e330SKris Buschelman         SSE_INLINE_END_2
37413660e330SKris Buschelman 
37423660e330SKris Buschelman         v  += 16;
37433660e330SKris Buschelman       }
37443660e330SKris Buschelman       v    =  aa + 16*ai[++i];
37453660e330SKris Buschelman       PREFETCH_NTA(v);
3746eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
37473660e330SKris Buschelman     }
3748eb05f457SKris Buschelman 
3749eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3750eb05f457SKris Buschelman 
37513660e330SKris Buschelman     idt  = 4*(n-1);
37523660e330SKris Buschelman     ai16 = 16*diag[n-1];
37533660e330SKris Buschelman     v    = aa + ai16 + 16;
37543660e330SKris Buschelman     for (i=n-1; i>=0;){
37553660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
37563660e330SKris Buschelman       vi = aj + diag[i] + 1;
37573660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
37583660e330SKris Buschelman 
3759eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
37603660e330SKris Buschelman 
37613660e330SKris Buschelman       while (nz--) {
37623660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
37632aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
37643660e330SKris Buschelman 
37653660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3766eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
37673660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
37683660e330SKris Buschelman 
37693660e330SKris Buschelman           /* First Column */
37703660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
37713660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
37723660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
37733660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
37743660e330SKris Buschelman 
37753660e330SKris Buschelman           /* Second Column */
37763660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
37773660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
37783660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
37793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
37803660e330SKris Buschelman 
37813660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
37823660e330SKris Buschelman 
37833660e330SKris Buschelman           /* Third Column */
37843660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
37853660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
37863660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
37873660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
37883660e330SKris Buschelman 
37893660e330SKris Buschelman           /* Fourth Column */
37903660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
37913660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
37923660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
37933660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
37943660e330SKris Buschelman         SSE_INLINE_END_2
37953660e330SKris Buschelman         v  += 16;
37963660e330SKris Buschelman       }
37973660e330SKris Buschelman       v    = aa + ai16;
37983660e330SKris Buschelman       ai16 = 16*diag[--i];
37993660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
38003660e330SKris Buschelman       /*
38013660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
38023660e330SKris Buschelman          which was inverted as part of the factorization
38033660e330SKris Buschelman       */
3804eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
38053660e330SKris Buschelman         /* First Column */
38063660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
38073660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
38083660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
38093660e330SKris Buschelman 
38103660e330SKris Buschelman         /* Second Column */
38113660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
38123660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
38133660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
38143660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
38153660e330SKris Buschelman 
38163660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
38173660e330SKris Buschelman 
38183660e330SKris Buschelman         /* Third Column */
38193660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
38203660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
38213660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
38223660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
38233660e330SKris Buschelman 
38243660e330SKris Buschelman         /* Fourth Column */
38253660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
38263660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
38273660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
38283660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
38293660e330SKris Buschelman 
38303660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
38313660e330SKris Buschelman       SSE_INLINE_END_3
38323660e330SKris Buschelman 
38333660e330SKris Buschelman       v    = aa + ai16 + 16;
38343660e330SKris Buschelman       idt -= 4;
38353660e330SKris Buschelman     }
3836eb05f457SKris Buschelman 
3837eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3838eb05f457SKris Buschelman     idt = 4*(n-1);
3839eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3840eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3841eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3842eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3843eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3844eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3845eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3846eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3847eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
384854693613SKris Buschelman       idt -= 4;
38493660e330SKris Buschelman     }
3850eb05f457SKris Buschelman 
3851eb05f457SKris Buschelman   } /* End of artificial scope. */
38521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
38531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3854dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
38553660e330SKris Buschelman   SSE_SCOPE_END;
38563660e330SKris Buschelman   PetscFunctionReturn(0);
38573660e330SKris Buschelman }
38583660e330SKris Buschelman 
38597cf1b8d3SKris Buschelman #undef __FUNCT__
38607cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3861dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
38627cf1b8d3SKris Buschelman {
38637cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
38647cf1b8d3SKris Buschelman   int            *aj=a->j;
3865dfbe8321SBarry Smith   PetscErrorCode ierr;
3866dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
38677cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
38687cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
38697cf1b8d3SKris Buschelman 
38707cf1b8d3SKris Buschelman   PetscFunctionBegin;
38717cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
38727cf1b8d3SKris Buschelman   /*
38737cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
38747cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
38757cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
38767cf1b8d3SKris Buschelman   */
38777cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
38787cf1b8d3SKris Buschelman 
38791ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
38801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
38817cf1b8d3SKris Buschelman   {
38827cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
38837cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
38847cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
38857cf1b8d3SKris Buschelman     int       jdx,idx;
38867cf1b8d3SKris Buschelman     int       *vi;
38877cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
38887cf1b8d3SKris Buschelman 
38897cf1b8d3SKris Buschelman     /* First block is the identity. */
38907cf1b8d3SKris Buschelman     idx  = 0;
38917cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
38927cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
38937cf1b8d3SKris Buschelman 
38947cf1b8d3SKris Buschelman     for (i=1; i<n;) {
38957cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
38967cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
38977cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
38987cf1b8d3SKris Buschelman       idx +=  4;
38997cf1b8d3SKris Buschelman 
39007cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
39017cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
39027cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
39037cf1b8d3SKris Buschelman 
39047cf1b8d3SKris Buschelman       while (nz--) {
39057cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
39067cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
39077cf1b8d3SKris Buschelman /*          jdx = *vi++; */
39087cf1b8d3SKris Buschelman 
39097cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
39107cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
39117cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
39127cf1b8d3SKris Buschelman 
39137cf1b8d3SKris Buschelman           /* First Column */
39147cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
39157cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
39167cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
39177cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
39187cf1b8d3SKris Buschelman 
39197cf1b8d3SKris Buschelman           /* Second Column */
39207cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
39217cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
39227cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
39237cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
39247cf1b8d3SKris Buschelman 
39257cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
39267cf1b8d3SKris Buschelman 
39277cf1b8d3SKris Buschelman           /* Third Column */
39287cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
39297cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
39307cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
39317cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
39327cf1b8d3SKris Buschelman 
39337cf1b8d3SKris Buschelman           /* Fourth Column */
39347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
39357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
39367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
39377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
39387cf1b8d3SKris Buschelman         SSE_INLINE_END_2
39397cf1b8d3SKris Buschelman 
39407cf1b8d3SKris Buschelman         v  += 16;
39417cf1b8d3SKris Buschelman       }
39427cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
39437cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
39447cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
39457cf1b8d3SKris Buschelman     }
39467cf1b8d3SKris Buschelman 
39477cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
39487cf1b8d3SKris Buschelman 
39497cf1b8d3SKris Buschelman     idt  = 4*(n-1);
39507cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
39517cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
39527cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
39537cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
39547cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
39557cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
39567cf1b8d3SKris Buschelman 
39577cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
39587cf1b8d3SKris Buschelman 
39597cf1b8d3SKris Buschelman       while (nz--) {
39607cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
39617cf1b8d3SKris Buschelman         idx = 4*(*vi++);
39627cf1b8d3SKris Buschelman /*          idx = *vi++; */
39637cf1b8d3SKris Buschelman 
39647cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
39657cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
39667cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
39677cf1b8d3SKris Buschelman 
39687cf1b8d3SKris Buschelman           /* First Column */
39697cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
39707cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
39717cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
39727cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
39737cf1b8d3SKris Buschelman 
39747cf1b8d3SKris Buschelman           /* Second Column */
39757cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
39767cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
39777cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
39787cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
39797cf1b8d3SKris Buschelman 
39807cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
39817cf1b8d3SKris Buschelman 
39827cf1b8d3SKris Buschelman           /* Third Column */
39837cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
39847cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
39857cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
39867cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
39877cf1b8d3SKris Buschelman 
39887cf1b8d3SKris Buschelman           /* Fourth Column */
39897cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
39907cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
39917cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
39927cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
39937cf1b8d3SKris Buschelman         SSE_INLINE_END_2
39947cf1b8d3SKris Buschelman         v  += 16;
39957cf1b8d3SKris Buschelman       }
39967cf1b8d3SKris Buschelman       v    = aa + ai16;
39977cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
39987cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
39997cf1b8d3SKris Buschelman       /*
40007cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
40017cf1b8d3SKris Buschelman          which was inverted as part of the factorization
40027cf1b8d3SKris Buschelman       */
40037cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
40047cf1b8d3SKris Buschelman         /* First Column */
40057cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
40067cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
40077cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
40087cf1b8d3SKris Buschelman 
40097cf1b8d3SKris Buschelman         /* Second Column */
40107cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
40117cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
40127cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
40137cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
40147cf1b8d3SKris Buschelman 
40157cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
40167cf1b8d3SKris Buschelman 
40177cf1b8d3SKris Buschelman         /* Third Column */
40187cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
40197cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
40207cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
40217cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
40227cf1b8d3SKris Buschelman 
40237cf1b8d3SKris Buschelman         /* Fourth Column */
40247cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
40257cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
40267cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
40277cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
40287cf1b8d3SKris Buschelman 
40297cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
40307cf1b8d3SKris Buschelman       SSE_INLINE_END_3
40317cf1b8d3SKris Buschelman 
40327cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
40337cf1b8d3SKris Buschelman       idt -= 4;
40347cf1b8d3SKris Buschelman     }
40357cf1b8d3SKris Buschelman 
40367cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
40377cf1b8d3SKris Buschelman     idt = 4*(n-1);
40387cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
40397cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
40407cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
40417cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
40427cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
40437cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
40447cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
40457cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
40467cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
40477cf1b8d3SKris Buschelman       idt -= 4;
40487cf1b8d3SKris Buschelman     }
40497cf1b8d3SKris Buschelman 
40507cf1b8d3SKris Buschelman   } /* End of artificial scope. */
40511ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4053dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
40547cf1b8d3SKris Buschelman   SSE_SCOPE_END;
40557cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
40567cf1b8d3SKris Buschelman }
40577cf1b8d3SKris Buschelman 
40583660e330SKris Buschelman #endif
40598f690400SShri Abhyankar 
40604a2ae208SSatish Balay #undef __FUNCT__
40614a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
4062dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
40634e2b4712SSatish Balay {
40644e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
40654e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
40666849ba73SBarry Smith   PetscErrorCode    ierr;
40675d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
40685d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4069d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4070d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4071d9fead3dSBarry Smith   const PetscScalar *b;
40724e2b4712SSatish Balay 
40734e2b4712SSatish Balay   PetscFunctionBegin;
4074d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40751ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4076f1af5d2fSBarry Smith   t  = a->solve_work;
40774e2b4712SSatish Balay 
40784e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
40794e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
40804e2b4712SSatish Balay 
40814e2b4712SSatish Balay   /* forward solve the lower triangular */
40824e2b4712SSatish Balay   idx    = 3*(*r++);
4083f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
40844e2b4712SSatish Balay   for (i=1; i<n; i++) {
40854e2b4712SSatish Balay     v     = aa + 9*ai[i];
40864e2b4712SSatish Balay     vi    = aj + ai[i];
40874e2b4712SSatish Balay     nz    = diag[i] - ai[i];
40884e2b4712SSatish Balay     idx   = 3*(*r++);
4089f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
40904e2b4712SSatish Balay     while (nz--) {
40914e2b4712SSatish Balay       idx   = 3*(*vi++);
4092f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4093f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4094f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4095f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40964e2b4712SSatish Balay       v += 9;
40974e2b4712SSatish Balay     }
40984e2b4712SSatish Balay     idx = 3*i;
4099f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
41004e2b4712SSatish Balay   }
41014e2b4712SSatish Balay   /* backward solve the upper triangular */
41024e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
41034e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
41044e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41054e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
41064e2b4712SSatish Balay     idt  = 3*i;
4107f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
41084e2b4712SSatish Balay     while (nz--) {
41094e2b4712SSatish Balay       idx   = 3*(*vi++);
4110f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4111f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4112f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4113f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41144e2b4712SSatish Balay       v += 9;
41154e2b4712SSatish Balay     }
41164e2b4712SSatish Balay     idc = 3*(*c--);
41174e2b4712SSatish Balay     v   = aa + 9*diag[i];
4118f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4119f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4120f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
41214e2b4712SSatish Balay   }
41224e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
41234e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4124d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41251ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4126dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
41274e2b4712SSatish Balay   PetscFunctionReturn(0);
41284e2b4712SSatish Balay }
41294e2b4712SSatish Balay 
41308f690400SShri Abhyankar #undef __FUNCT__
41318f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
41328f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
41338f690400SShri Abhyankar {
41348f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
41358f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
41368f690400SShri Abhyankar   PetscErrorCode    ierr;
413729b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
41388f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
41398f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
41408f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
41418f690400SShri Abhyankar   const PetscScalar *b;
41428f690400SShri Abhyankar 
41438f690400SShri Abhyankar   PetscFunctionBegin;
41448f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41458f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
41468f690400SShri Abhyankar   t  = a->solve_work;
41478f690400SShri Abhyankar 
41488f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
414929b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
41508f690400SShri Abhyankar 
41518f690400SShri Abhyankar   /* forward solve the lower triangular */
415229b92fc1SShri Abhyankar   idx    = 3*r[0];
41538f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
41548f690400SShri Abhyankar   for (i=1; i<n; i++) {
41558f690400SShri Abhyankar     v     = aa + 9*ai[i];
41568f690400SShri Abhyankar     vi    = aj + ai[i];
41578f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
415829b92fc1SShri Abhyankar     idx   = 3*r[i];
41598f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
416029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
416129b92fc1SShri Abhyankar       idx   = 3*vi[m];
41628f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
41638f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
41648f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
41658f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41668f690400SShri Abhyankar       v += 9;
41678f690400SShri Abhyankar     }
41688f690400SShri Abhyankar     idx = 3*i;
41698f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
41708f690400SShri Abhyankar   }
41718f690400SShri Abhyankar   /* backward solve the upper triangular */
41728f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
41738f690400SShri Abhyankar     k    = 2*n-i;
41748f690400SShri Abhyankar     v    = aa + 9*ai[k];
41758f690400SShri Abhyankar     vi   = aj + ai[k];
41768f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
41778f690400SShri Abhyankar     idt  = 3*i;
41788f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
417929b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
418029b92fc1SShri Abhyankar       idx   = 3*vi[m];
41818f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
41828f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
41838f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
41848f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
41858f690400SShri Abhyankar       v += 9;
41868f690400SShri Abhyankar     }
418729b92fc1SShri Abhyankar     idc = 3*c[i];
41888f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
41898f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
41908f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
41918f690400SShri Abhyankar   }
41928f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
41938f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
41948f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41958f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
41968f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
41978f690400SShri Abhyankar   PetscFunctionReturn(0);
41988f690400SShri Abhyankar }
41998f690400SShri Abhyankar 
42000c4413a7SShri Abhyankar #undef __FUNCT__
42010c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
42020c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
42030c4413a7SShri Abhyankar {
42040c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42050c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
42060c4413a7SShri Abhyankar   PetscErrorCode    ierr;
42070c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
42080c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
42090c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
42100c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
42110c4413a7SShri Abhyankar   const PetscScalar *b;
42120c4413a7SShri Abhyankar 
42130c4413a7SShri Abhyankar   PetscFunctionBegin;
42140c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42150c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
42160c4413a7SShri Abhyankar   t  = a->solve_work;
42170c4413a7SShri Abhyankar 
42180c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
42190c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
42200c4413a7SShri Abhyankar 
42210c4413a7SShri Abhyankar   /* forward solve the lower triangular */
42220c4413a7SShri Abhyankar   idx    = 3*r[0];
42230c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
42240c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
42250c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
42260c4413a7SShri Abhyankar     vi    = aj + ai[i];
42270c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
42280c4413a7SShri Abhyankar     idx   = 3*r[i];
42290c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
42300c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
42310c4413a7SShri Abhyankar       idx   = 3*vi[m];
42320c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
42330c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
42340c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
42350c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
42360c4413a7SShri Abhyankar       v += 9;
42370c4413a7SShri Abhyankar     }
42380c4413a7SShri Abhyankar     idx = 3*i;
42390c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
42400c4413a7SShri Abhyankar   }
42410c4413a7SShri Abhyankar   /* backward solve the upper triangular */
42420c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
42430c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
42440c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
42450c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
42460c4413a7SShri Abhyankar     idt  = 3*i;
42470c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
42480c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
42490c4413a7SShri Abhyankar       idx   = 3*vi[m];
42500c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
42510c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
42520c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
42530c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
42540c4413a7SShri Abhyankar       v += 9;
42550c4413a7SShri Abhyankar     }
42560c4413a7SShri Abhyankar     idc = 3*c[i];
42570c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
42580c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
42590c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
42600c4413a7SShri Abhyankar   }
42610c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
42620c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
42630c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42640c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
42650c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
42660c4413a7SShri Abhyankar   PetscFunctionReturn(0);
42670c4413a7SShri Abhyankar }
42680c4413a7SShri Abhyankar 
426915091d37SBarry Smith /*
427015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
427115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
427215091d37SBarry Smith */
42734a2ae208SSatish Balay #undef __FUNCT__
42744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4275dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
427615091d37SBarry Smith {
427715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4278690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4279dfbe8321SBarry Smith   PetscErrorCode    ierr;
4280690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4281d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4282d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4283d9fead3dSBarry Smith   const PetscScalar *b;
4284690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
428515091d37SBarry Smith 
428615091d37SBarry Smith   PetscFunctionBegin;
4287d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
428915091d37SBarry Smith 
429015091d37SBarry Smith   /* forward solve the lower triangular */
429115091d37SBarry Smith   idx    = 0;
429215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
429315091d37SBarry Smith   for (i=1; i<n; i++) {
429415091d37SBarry Smith     v     =  aa      + 9*ai[i];
429515091d37SBarry Smith     vi    =  aj      + ai[i];
429615091d37SBarry Smith     nz    =  diag[i] - ai[i];
429715091d37SBarry Smith     idx   +=  3;
4298f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
429915091d37SBarry Smith     while (nz--) {
430015091d37SBarry Smith       jdx   = 3*(*vi++);
430115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4302f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4303f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4304f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
430515091d37SBarry Smith       v    += 9;
430615091d37SBarry Smith     }
4307f1af5d2fSBarry Smith     x[idx]   = s1;
4308f1af5d2fSBarry Smith     x[1+idx] = s2;
4309f1af5d2fSBarry Smith     x[2+idx] = s3;
431015091d37SBarry Smith   }
431115091d37SBarry Smith   /* backward solve the upper triangular */
431215091d37SBarry Smith   for (i=n-1; i>=0; i--){
431315091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
431415091d37SBarry Smith     vi   = aj + diag[i] + 1;
431515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
431615091d37SBarry Smith     idt  = 3*i;
4317f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4318f1af5d2fSBarry Smith     s3 = x[2+idt];
431915091d37SBarry Smith     while (nz--) {
432015091d37SBarry Smith       idx   = 3*(*vi++);
432115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4322f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4323f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4324f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
432515091d37SBarry Smith       v    += 9;
432615091d37SBarry Smith     }
432715091d37SBarry Smith     v        = aa +  9*diag[i];
4328f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4329f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4330f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
433115091d37SBarry Smith   }
433215091d37SBarry Smith 
4333d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43341ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4335dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
433615091d37SBarry Smith   PetscFunctionReturn(0);
433715091d37SBarry Smith }
433815091d37SBarry Smith 
43394a2ae208SSatish Balay #undef __FUNCT__
4340cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4341cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4342cee9d6f2SShri Abhyankar {
4343cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4344ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4345cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4346cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
4347cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4348cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4349cee9d6f2SShri Abhyankar     PetscScalar       *x;
4350cee9d6f2SShri Abhyankar     const PetscScalar *b;
4351cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4352cee9d6f2SShri Abhyankar 
4353cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4354cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4355cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4356cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4357cee9d6f2SShri Abhyankar     idx    = 0;
4358cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4359cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4360cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
4361cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4362cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4363cee9d6f2SShri Abhyankar       idx   = bs*i;
4364cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4365ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4366ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
4367cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4368cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4369cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4370cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4371cee9d6f2SShri Abhyankar 
4372cee9d6f2SShri Abhyankar           v   +=  bs2;
4373cee9d6f2SShri Abhyankar         }
4374cee9d6f2SShri Abhyankar 
4375cee9d6f2SShri Abhyankar        x[idx]   = s1;
4376cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4377cee9d6f2SShri Abhyankar        x[2+idx] = s3;
4378cee9d6f2SShri Abhyankar     }
4379cee9d6f2SShri Abhyankar 
4380cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4381cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4382cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
4383cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4384cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4385cee9d6f2SShri Abhyankar      idt = bs*i;
4386cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4387cee9d6f2SShri Abhyankar 
4388ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4389ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
4390cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4391cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4392cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4393cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4394cee9d6f2SShri Abhyankar 
4395cee9d6f2SShri Abhyankar         v   +=  bs2;
4396cee9d6f2SShri Abhyankar     }
4397cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4398cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4399cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4400cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4401cee9d6f2SShri Abhyankar 
4402cee9d6f2SShri Abhyankar   }
4403cee9d6f2SShri Abhyankar 
4404cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4405cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4406cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4407cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4408cee9d6f2SShri Abhyankar }
4409cee9d6f2SShri Abhyankar 
4410cee9d6f2SShri Abhyankar #undef __FUNCT__
4411b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4412b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4413b2b2dd24SShri Abhyankar {
4414b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4415b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4416b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4417b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4418b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4419b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4420b2b2dd24SShri Abhyankar     PetscScalar       *x;
4421b2b2dd24SShri Abhyankar     const PetscScalar *b;
4422b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4423b2b2dd24SShri Abhyankar 
4424b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4425b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4426b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4427b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4428b2b2dd24SShri Abhyankar     idx    = 0;
4429b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4430b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4431b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4432b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4433b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4434b2b2dd24SShri Abhyankar       idx   = bs*i;
4435b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4436b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4437b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4438b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4439b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4440b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4441b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4442b2b2dd24SShri Abhyankar 
4443b2b2dd24SShri Abhyankar           v   +=  bs2;
4444b2b2dd24SShri Abhyankar         }
4445b2b2dd24SShri Abhyankar 
4446b2b2dd24SShri Abhyankar        x[idx]   = s1;
4447b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4448b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4449b2b2dd24SShri Abhyankar     }
4450b2b2dd24SShri Abhyankar 
4451b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4452b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4453b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4454b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4455b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4456b2b2dd24SShri Abhyankar      idt = bs*i;
4457b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4458b2b2dd24SShri Abhyankar 
4459b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4460b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4461b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4462b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4463b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4464b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4465b2b2dd24SShri Abhyankar 
4466b2b2dd24SShri Abhyankar         v   +=  bs2;
4467b2b2dd24SShri Abhyankar     }
4468b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4469b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4470b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4471b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4472b2b2dd24SShri Abhyankar 
4473b2b2dd24SShri Abhyankar   }
4474b2b2dd24SShri Abhyankar 
4475b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4476b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4477b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4478b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4479b2b2dd24SShri Abhyankar }
4480b2b2dd24SShri Abhyankar 
4481b2b2dd24SShri Abhyankar #undef __FUNCT__
44824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4483dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
44844e2b4712SSatish Balay {
44854e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
44864e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
44876849ba73SBarry Smith   PetscErrorCode    ierr;
44885d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
44895d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4490d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4491d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4492d9fead3dSBarry Smith   const PetscScalar *b;
44934e2b4712SSatish Balay 
44944e2b4712SSatish Balay   PetscFunctionBegin;
4495d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4497f1af5d2fSBarry Smith   t  = a->solve_work;
44984e2b4712SSatish Balay 
44994e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
45004e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
45014e2b4712SSatish Balay 
45024e2b4712SSatish Balay   /* forward solve the lower triangular */
45034e2b4712SSatish Balay   idx    = 2*(*r++);
4504f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
45054e2b4712SSatish Balay   for (i=1; i<n; i++) {
45064e2b4712SSatish Balay     v     = aa + 4*ai[i];
45074e2b4712SSatish Balay     vi    = aj + ai[i];
45084e2b4712SSatish Balay     nz    = diag[i] - ai[i];
45094e2b4712SSatish Balay     idx   = 2*(*r++);
4510f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
45114e2b4712SSatish Balay     while (nz--) {
45124e2b4712SSatish Balay       idx   = 2*(*vi++);
4513f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4514f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4515f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
45164e2b4712SSatish Balay       v += 4;
45174e2b4712SSatish Balay     }
45184e2b4712SSatish Balay     idx = 2*i;
4519f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
45204e2b4712SSatish Balay   }
45214e2b4712SSatish Balay   /* backward solve the upper triangular */
45224e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
45234e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
45244e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
45254e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
45264e2b4712SSatish Balay     idt  = 2*i;
4527f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
45284e2b4712SSatish Balay     while (nz--) {
45294e2b4712SSatish Balay       idx   = 2*(*vi++);
4530f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4531f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4532f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
45334e2b4712SSatish Balay       v += 4;
45344e2b4712SSatish Balay     }
45354e2b4712SSatish Balay     idc = 2*(*c--);
45364e2b4712SSatish Balay     v   = aa + 4*diag[i];
4537f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4538f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
45394e2b4712SSatish Balay   }
45404e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45414e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4542d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4544dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
45454e2b4712SSatish Balay   PetscFunctionReturn(0);
45464e2b4712SSatish Balay }
45474e2b4712SSatish Balay 
45488f690400SShri Abhyankar #undef __FUNCT__
45498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
45508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
45518f690400SShri Abhyankar {
45528f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
45538f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
45548f690400SShri Abhyankar   PetscErrorCode    ierr;
455529b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
45568f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
45578f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
45588f690400SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
45598f690400SShri Abhyankar   const PetscScalar *b;
45608f690400SShri Abhyankar 
45618f690400SShri Abhyankar   PetscFunctionBegin;
45628f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45638f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45648f690400SShri Abhyankar   t  = a->solve_work;
45658f690400SShri Abhyankar 
45668f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
456729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
45688f690400SShri Abhyankar 
45698f690400SShri Abhyankar   /* forward solve the lower triangular */
457029b92fc1SShri Abhyankar   idx    = 2*r[0];
45718f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
45728f690400SShri Abhyankar   for (i=1; i<n; i++) {
45738f690400SShri Abhyankar     v     = aa + 4*ai[i];
45748f690400SShri Abhyankar     vi    = aj + ai[i];
45758f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
457629b92fc1SShri Abhyankar     idx   = 2*r[i];
45778f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
457829b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
457929b92fc1SShri Abhyankar       jdx   = 2*vi[m];
45808f690400SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
45818f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
45828f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
45838f690400SShri Abhyankar       v += 4;
45848f690400SShri Abhyankar     }
45858f690400SShri Abhyankar     idx = 2*i;
45868f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
45878f690400SShri Abhyankar   }
45888f690400SShri Abhyankar   /* backward solve the upper triangular */
45898f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
45908f690400SShri Abhyankar     k = 2*n-i;
45918f690400SShri Abhyankar     v    = aa + 4*ai[k];
45928f690400SShri Abhyankar     vi   = aj + ai[k];
45938f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
45948f690400SShri Abhyankar     idt  = 2*i;
45958f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
459629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
459729b92fc1SShri Abhyankar       idx   = 2*vi[m];
45988f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
45998f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
46008f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
46018f690400SShri Abhyankar       v += 4;
46028f690400SShri Abhyankar     }
460329b92fc1SShri Abhyankar     idc = 2*c[i];
46048f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
46058f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
46068f690400SShri Abhyankar   }
46078f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
46088f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
46098f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46108f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
46118f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
46128f690400SShri Abhyankar   PetscFunctionReturn(0);
46138f690400SShri Abhyankar }
46148f690400SShri Abhyankar 
46150c4413a7SShri Abhyankar #undef __FUNCT__
46160c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
46170c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
46180c4413a7SShri Abhyankar {
46190c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
46200c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
46210c4413a7SShri Abhyankar   PetscErrorCode    ierr;
46220c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
46230c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
46240c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
46250c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
46260c4413a7SShri Abhyankar   const PetscScalar *b;
46270c4413a7SShri Abhyankar 
46280c4413a7SShri Abhyankar   PetscFunctionBegin;
46290c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46300c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
46310c4413a7SShri Abhyankar   t  = a->solve_work;
46320c4413a7SShri Abhyankar 
46330c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
46340c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
46350c4413a7SShri Abhyankar 
46360c4413a7SShri Abhyankar   /* forward solve the lower triangular */
46370c4413a7SShri Abhyankar   idx    = 2*r[0];
46380c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
46390c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
46400c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
46410c4413a7SShri Abhyankar     vi    = aj + ai[i];
46420c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
46430c4413a7SShri Abhyankar     idx   = 2*r[i];
46440c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
46450c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
46460c4413a7SShri Abhyankar       jdx   = 2*vi[m];
46470c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
46480c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
46490c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
46500c4413a7SShri Abhyankar       v += 4;
46510c4413a7SShri Abhyankar     }
46520c4413a7SShri Abhyankar     idx = 2*i;
46530c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
46540c4413a7SShri Abhyankar   }
46550c4413a7SShri Abhyankar   /* backward solve the upper triangular */
46560c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
46570c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
46580c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
46590c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
46600c4413a7SShri Abhyankar     idt  = 2*i;
46610c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
46620c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
46630c4413a7SShri Abhyankar       idx   = 2*vi[m];
46640c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
46650c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
46660c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
46670c4413a7SShri Abhyankar       v += 4;
46680c4413a7SShri Abhyankar     }
46690c4413a7SShri Abhyankar     idc = 2*c[i];
46700c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
46710c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
46720c4413a7SShri Abhyankar   }
46730c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
46740c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
46750c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46760c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
46770c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
46780c4413a7SShri Abhyankar   PetscFunctionReturn(0);
46790c4413a7SShri Abhyankar }
46808f690400SShri Abhyankar 
468115091d37SBarry Smith /*
468215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
468315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
468415091d37SBarry Smith */
46854a2ae208SSatish Balay #undef __FUNCT__
46864a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4687dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
468815091d37SBarry Smith {
468915091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4690690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4691dfbe8321SBarry Smith   PetscErrorCode    ierr;
4692690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4693d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4694d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4695d9fead3dSBarry Smith   const PetscScalar *b;
4696690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
469715091d37SBarry Smith 
469815091d37SBarry Smith   PetscFunctionBegin;
4699d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47001ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
470115091d37SBarry Smith 
470215091d37SBarry Smith   /* forward solve the lower triangular */
470315091d37SBarry Smith   idx    = 0;
470415091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
470515091d37SBarry Smith   for (i=1; i<n; i++) {
470615091d37SBarry Smith     v     =  aa      + 4*ai[i];
470715091d37SBarry Smith     vi    =  aj      + ai[i];
470815091d37SBarry Smith     nz    =  diag[i] - ai[i];
470915091d37SBarry Smith     idx   +=  2;
4710f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
471115091d37SBarry Smith     while (nz--) {
471215091d37SBarry Smith       jdx   = 2*(*vi++);
471315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4714f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4715f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
471615091d37SBarry Smith       v    += 4;
471715091d37SBarry Smith     }
4718f1af5d2fSBarry Smith     x[idx]   = s1;
4719f1af5d2fSBarry Smith     x[1+idx] = s2;
472015091d37SBarry Smith   }
472115091d37SBarry Smith   /* backward solve the upper triangular */
472215091d37SBarry Smith   for (i=n-1; i>=0; i--){
472315091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
472415091d37SBarry Smith     vi   = aj + diag[i] + 1;
472515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
472615091d37SBarry Smith     idt  = 2*i;
4727f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
472815091d37SBarry Smith     while (nz--) {
472915091d37SBarry Smith       idx   = 2*(*vi++);
473015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4731f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4732f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
473315091d37SBarry Smith       v    += 4;
473415091d37SBarry Smith     }
473515091d37SBarry Smith     v        = aa +  4*diag[i];
4736f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4737f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
473815091d37SBarry Smith   }
473915091d37SBarry Smith 
4740d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47411ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4742dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
474315091d37SBarry Smith   PetscFunctionReturn(0);
474415091d37SBarry Smith }
474515091d37SBarry Smith 
47464a2ae208SSatish Balay #undef __FUNCT__
4747cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4748cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4749cee9d6f2SShri Abhyankar {
4750cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4751ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4752cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4753cee9d6f2SShri Abhyankar     PetscInt          jdx;
4754cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4755cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4756cee9d6f2SShri Abhyankar     const PetscScalar *b;
4757cee9d6f2SShri Abhyankar 
4758cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4759cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4760cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4761cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4762cee9d6f2SShri Abhyankar     idx    = 0;
4763cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4764cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4765cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
4766cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4767cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4768cee9d6f2SShri Abhyankar        idx  = 2*i;
4769cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4770ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4771ce3d78c0SShri Abhyankar          jdx   = 2*vi[k];
4772cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4773cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4774cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4775cee9d6f2SShri Abhyankar            v   +=  4;
4776cee9d6f2SShri Abhyankar         }
4777cee9d6f2SShri Abhyankar        x[idx]   = s1;
4778cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4779cee9d6f2SShri Abhyankar     }
4780cee9d6f2SShri Abhyankar 
4781cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4782cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4783cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
4784cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4785cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4786cee9d6f2SShri Abhyankar      idt = 2*i;
4787cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4788ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4789ce3d78c0SShri Abhyankar       idx   = 2*vi[k];
4790cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4791cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4792cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4793cee9d6f2SShri Abhyankar          v    += 4;
4794cee9d6f2SShri Abhyankar     }
4795cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4796cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4797cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4798cee9d6f2SShri Abhyankar   }
4799cee9d6f2SShri Abhyankar 
4800cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4801cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4802cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4803cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4804cee9d6f2SShri Abhyankar }
4805cee9d6f2SShri Abhyankar 
4806cee9d6f2SShri Abhyankar #undef __FUNCT__
4807b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4808b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4809b2b2dd24SShri Abhyankar {
4810b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4811b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4812b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4813b2b2dd24SShri Abhyankar     PetscInt          jdx;
4814b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4815b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4816b2b2dd24SShri Abhyankar     const PetscScalar *b;
4817b2b2dd24SShri Abhyankar 
4818b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4819b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4820b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4821b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4822b2b2dd24SShri Abhyankar     idx    = 0;
4823b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4824b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4825b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4826b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4827b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4828b2b2dd24SShri Abhyankar        idx  = 2*i;
4829b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4830b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4831b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4832b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4833b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4834b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4835b2b2dd24SShri Abhyankar            v   +=  4;
4836b2b2dd24SShri Abhyankar         }
4837b2b2dd24SShri Abhyankar        x[idx]   = s1;
4838b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4839b2b2dd24SShri Abhyankar     }
4840b2b2dd24SShri Abhyankar 
4841b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4842b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4843b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4844b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4845b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4846b2b2dd24SShri Abhyankar      idt = 2*i;
4847b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4848b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4849b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4850b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4851b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4852b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4853b2b2dd24SShri Abhyankar          v    += 4;
4854b2b2dd24SShri Abhyankar     }
4855b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4856b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4857b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4858b2b2dd24SShri Abhyankar   }
4859b2b2dd24SShri Abhyankar 
4860b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4861b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4862b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4863b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4864b2b2dd24SShri Abhyankar }
4865b2b2dd24SShri Abhyankar 
4866b2b2dd24SShri Abhyankar #undef __FUNCT__
48674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4868dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
48694e2b4712SSatish Balay {
48704e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
48714e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
48726849ba73SBarry Smith   PetscErrorCode ierr;
48735d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
48745d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
48753f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
487687828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
48774e2b4712SSatish Balay 
48784e2b4712SSatish Balay   PetscFunctionBegin;
48794e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
48804e2b4712SSatish Balay 
48811ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
48821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4883f1af5d2fSBarry Smith   t  = a->solve_work;
48844e2b4712SSatish Balay 
48854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
48874e2b4712SSatish Balay 
48884e2b4712SSatish Balay   /* forward solve the lower triangular */
4889f1af5d2fSBarry Smith   t[0] = b[*r++];
48904e2b4712SSatish Balay   for (i=1; i<n; i++) {
48914e2b4712SSatish Balay     v     = aa + ai[i];
48924e2b4712SSatish Balay     vi    = aj + ai[i];
48934e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4894f1af5d2fSBarry Smith     s1  = b[*r++];
48954e2b4712SSatish Balay     while (nz--) {
4896f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
48974e2b4712SSatish Balay     }
4898f1af5d2fSBarry Smith     t[i] = s1;
48994e2b4712SSatish Balay   }
49004e2b4712SSatish Balay   /* backward solve the upper triangular */
49014e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
49024e2b4712SSatish Balay     v    = aa + diag[i] + 1;
49034e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
49044e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4905f1af5d2fSBarry Smith     s1 = t[i];
49064e2b4712SSatish Balay     while (nz--) {
4907f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
49084e2b4712SSatish Balay     }
4909f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
49104e2b4712SSatish Balay   }
49114e2b4712SSatish Balay 
49124e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
49134e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
49141ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49151ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4916dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
49174e2b4712SSatish Balay   PetscFunctionReturn(0);
49184e2b4712SSatish Balay }
491915091d37SBarry Smith /*
492015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
492115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
492215091d37SBarry Smith */
49234a2ae208SSatish Balay #undef __FUNCT__
49244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4925dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
492615091d37SBarry Smith {
492715091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4928690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4929dfbe8321SBarry Smith   PetscErrorCode ierr;
4930690b6cddSBarry Smith   PetscInt       *diag = a->diag;
493115091d37SBarry Smith   MatScalar      *aa=a->a;
493287828ca2SBarry Smith   PetscScalar    *x,*b;
493387828ca2SBarry Smith   PetscScalar    s1,x1;
493415091d37SBarry Smith   MatScalar      *v;
4935690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
493615091d37SBarry Smith 
493715091d37SBarry Smith   PetscFunctionBegin;
49381ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
49391ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
494015091d37SBarry Smith 
494115091d37SBarry Smith   /* forward solve the lower triangular */
494215091d37SBarry Smith   idx    = 0;
494315091d37SBarry Smith   x[0]   = b[0];
494415091d37SBarry Smith   for (i=1; i<n; i++) {
494515091d37SBarry Smith     v     =  aa      + ai[i];
494615091d37SBarry Smith     vi    =  aj      + ai[i];
494715091d37SBarry Smith     nz    =  diag[i] - ai[i];
494815091d37SBarry Smith     idx   +=  1;
4949f1af5d2fSBarry Smith     s1  =  b[idx];
495015091d37SBarry Smith     while (nz--) {
495115091d37SBarry Smith       jdx   = *vi++;
495215091d37SBarry Smith       x1    = x[jdx];
4953f1af5d2fSBarry Smith       s1 -= v[0]*x1;
495415091d37SBarry Smith       v    += 1;
495515091d37SBarry Smith     }
4956f1af5d2fSBarry Smith     x[idx]   = s1;
495715091d37SBarry Smith   }
495815091d37SBarry Smith   /* backward solve the upper triangular */
495915091d37SBarry Smith   for (i=n-1; i>=0; i--){
496015091d37SBarry Smith     v    = aa + diag[i] + 1;
496115091d37SBarry Smith     vi   = aj + diag[i] + 1;
496215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
496315091d37SBarry Smith     idt  = i;
4964f1af5d2fSBarry Smith     s1 = x[idt];
496515091d37SBarry Smith     while (nz--) {
496615091d37SBarry Smith       idx   = *vi++;
496715091d37SBarry Smith       x1    = x[idx];
4968f1af5d2fSBarry Smith       s1 -= v[0]*x1;
496915091d37SBarry Smith       v    += 1;
497015091d37SBarry Smith     }
497115091d37SBarry Smith     v        = aa +  diag[i];
4972f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
497315091d37SBarry Smith   }
49741ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
49751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4976dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
497715091d37SBarry Smith   PetscFunctionReturn(0);
497815091d37SBarry Smith }
49794e2b4712SSatish Balay 
49804e2b4712SSatish Balay /* ----------------------------------------------------------------*/
498116a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
49826bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
49836bce7ff8SHong Zhang 
49846bce7ff8SHong Zhang #undef __FUNCT__
49856bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
49866bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
49876bce7ff8SHong Zhang {
49886bce7ff8SHong Zhang   Mat            C=B;
49896bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
49906bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
49916bce7ff8SHong Zhang   PetscErrorCode ierr;
49926bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
49936bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
49946bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4995b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4996914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4997914a18a2SHong Zhang   MatScalar      *v_work;
49986bce7ff8SHong Zhang 
49996bce7ff8SHong Zhang   PetscFunctionBegin;
50006bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
50016bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5002914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5003914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
50046bce7ff8SHong Zhang   ics  = ic;
50056bce7ff8SHong Zhang 
5006914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5007914a18a2SHong Zhang   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
5008b588c5a2SHong Zhang   mwork    = v_work + bs;
5009b588c5a2SHong Zhang   v_pivots = (PetscInt*)(mwork + bs2);
5010914a18a2SHong Zhang 
50116bce7ff8SHong Zhang   for (i=0; i<n; i++){
50126bce7ff8SHong Zhang     /* zero rtmp */
50136bce7ff8SHong Zhang     /* L part */
50146bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
50156bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5016914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5017914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5018914a18a2SHong Zhang     }
50196bce7ff8SHong Zhang 
50206bce7ff8SHong Zhang     /* U part */
50216bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
50226bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
5023914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5024914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5025914a18a2SHong Zhang     }
50266bce7ff8SHong Zhang 
50276bce7ff8SHong Zhang     /* load in initial (unfactored row) */
50286bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
50296bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
5030914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
50316bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
5032914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
50336bce7ff8SHong Zhang     }
50346bce7ff8SHong Zhang 
50356bce7ff8SHong Zhang     /* elimination */
50366bce7ff8SHong Zhang     bjtmp = bj + bi[i];
50376bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
5038b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
5039b1646270SShri Abhyankar       row = bjtmp[k];
5040914a18a2SHong Zhang       pc = rtmp + bs2*row;
5041914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
5042914a18a2SHong Zhang       if (flg) {
5043914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
5044b588c5a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
50456bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
5046914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
50476bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
5048914a18a2SHong Zhang         for (j=0; j<nz; j++) {
5049914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
5050914a18a2SHong Zhang         }
5051b588c5a2SHong Zhang         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
50526bce7ff8SHong Zhang       }
50536bce7ff8SHong Zhang     }
50546bce7ff8SHong Zhang 
50556bce7ff8SHong Zhang     /* finished row so stick it into b->a */
50566bce7ff8SHong Zhang     /* L part */
5057914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
50586bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
50596bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
50606bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
5061914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
50626bce7ff8SHong Zhang     }
50636bce7ff8SHong Zhang 
50646bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
5065914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
50666bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
5067914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
5068914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5069914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
50706bce7ff8SHong Zhang 
50716bce7ff8SHong Zhang     /* U part */
5072914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
50736bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
50746bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
5075914a18a2SHong Zhang     for (j=0; j<nz; j++){
5076914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5077914a18a2SHong Zhang     }
50786bce7ff8SHong Zhang   }
50796bce7ff8SHong Zhang 
50806bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
50816bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
50826bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
50836bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
508427019359SHong Zhang 
50856bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
5086914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
50876bce7ff8SHong Zhang   PetscFunctionReturn(0);
50886bce7ff8SHong Zhang }
50896bce7ff8SHong Zhang 
50901a83e813SShri Abhyankar #undef __FUNCT__
50911a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
50921a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
50931a83e813SShri Abhyankar {
50941a83e813SShri Abhyankar   Mat            C=B;
50951a83e813SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
50961a83e813SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
50971a83e813SShri Abhyankar   PetscErrorCode ierr;
50981a83e813SShri Abhyankar   const PetscInt *r,*ic,*ics;
50991a83e813SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
51001a83e813SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
51011a83e813SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
51021a83e813SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
51031a83e813SShri Abhyankar   MatScalar      *v_work;
51041a83e813SShri Abhyankar 
51051a83e813SShri Abhyankar   PetscFunctionBegin;
51061a83e813SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
51071a83e813SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
51081a83e813SShri Abhyankar   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
51091a83e813SShri Abhyankar   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
51101a83e813SShri Abhyankar   ics  = ic;
51111a83e813SShri Abhyankar 
51121a83e813SShri Abhyankar   /* generate work space needed by dense LU factorization */
51131a83e813SShri Abhyankar   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
51141a83e813SShri Abhyankar   mwork    = v_work + bs;
51151a83e813SShri Abhyankar   v_pivots = (PetscInt*)(mwork + bs2);
51161a83e813SShri Abhyankar 
51171a83e813SShri Abhyankar   for (i=0; i<n; i++){
51181a83e813SShri Abhyankar     /* zero rtmp */
51191a83e813SShri Abhyankar     /* L part */
51201a83e813SShri Abhyankar     nz    = bi[i+1] - bi[i];
51211a83e813SShri Abhyankar     bjtmp = bj + bi[i];
51221a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
51231a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51241a83e813SShri Abhyankar     }
51251a83e813SShri Abhyankar 
51261a83e813SShri Abhyankar     /* U part */
51271a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
51281a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
51291a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
51301a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51311a83e813SShri Abhyankar     }
51321a83e813SShri Abhyankar 
51331a83e813SShri Abhyankar     /* load in initial (unfactored row) */
51341a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
51351a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
51361a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
51371a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51381a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
51391a83e813SShri Abhyankar     }
51401a83e813SShri Abhyankar 
51411a83e813SShri Abhyankar     /* elimination */
51421a83e813SShri Abhyankar     bjtmp = bj + bi[i];
51431a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
51441a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
51451a83e813SShri Abhyankar       row = bjtmp[k];
51461a83e813SShri Abhyankar       pc = rtmp + bs2*row;
51471a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
51481a83e813SShri Abhyankar       if (flg) {
51491a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
51501a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
51511a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
51521a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
51531a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
51541a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
51551a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
51561a83e813SShri Abhyankar         }
51571a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
51581a83e813SShri Abhyankar       }
51591a83e813SShri Abhyankar     }
51601a83e813SShri Abhyankar 
51611a83e813SShri Abhyankar     /* finished row so stick it into b->a */
51621a83e813SShri Abhyankar     /* L part */
51631a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
51641a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
51651a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
51661a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51671a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51681a83e813SShri Abhyankar     }
51691a83e813SShri Abhyankar 
51701a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
51711a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
51721a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
51731a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
51741a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51751a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
51761a83e813SShri Abhyankar 
51771a83e813SShri Abhyankar     /* U part */
51781a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
51791a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
51801a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
51811a83e813SShri Abhyankar     for (j=0; j<nz; j++){
51821a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51831a83e813SShri Abhyankar     }
51841a83e813SShri Abhyankar   }
51851a83e813SShri Abhyankar 
51861a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
51871a83e813SShri Abhyankar   ierr = PetscFree(v_work);CHKERRQ(ierr);
51881a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
51891a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
51901a83e813SShri Abhyankar 
51911a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
51921a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
51931a83e813SShri Abhyankar   PetscFunctionReturn(0);
51941a83e813SShri Abhyankar }
51951a83e813SShri Abhyankar 
51966bce7ff8SHong Zhang /*
51976bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
519816a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
519916a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
52006bce7ff8SHong Zhang */
52016bce7ff8SHong Zhang #undef __FUNCT__
52026bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
52036bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
52046bce7ff8SHong Zhang {
52056bce7ff8SHong Zhang 
52066bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
52076bce7ff8SHong Zhang   PetscErrorCode     ierr;
520816a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
520916a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
52106bce7ff8SHong Zhang 
52116bce7ff8SHong Zhang   PetscFunctionBegin;
521216a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
521316a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
52146bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
521516a2bf60SHong Zhang 
521616a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
521716a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
521816a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
521916a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
522016a2bf60SHong Zhang   if (!b->diag){
522116a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
522216a2bf60SHong Zhang   }
5223914a18a2SHong Zhang   bdiag = b->diag;
52246bce7ff8SHong Zhang 
522516a2bf60SHong Zhang   if (n > 0) {
522616a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
52276bce7ff8SHong Zhang   }
52286bce7ff8SHong Zhang 
52296bce7ff8SHong Zhang   /* set bi and bj with new data structure */
52306bce7ff8SHong Zhang   bi = b->i;
52316bce7ff8SHong Zhang   bj = b->j;
52326bce7ff8SHong Zhang 
52336bce7ff8SHong Zhang   /* L part */
52346bce7ff8SHong Zhang   bi[0] = 0;
523516a2bf60SHong Zhang   for (i=0; i<n; i++){
52366bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
5237914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
52386bce7ff8SHong Zhang     aj = a->j + ai[i];
52396bce7ff8SHong Zhang     for (j=0; j<nz; j++){
52406bce7ff8SHong Zhang       *bj = aj[j]; bj++;
52416bce7ff8SHong Zhang     }
52426bce7ff8SHong Zhang   }
52436bce7ff8SHong Zhang 
52446bce7ff8SHong Zhang   /* U part */
524516a2bf60SHong Zhang   bi[n+1] = bi[n];
524616a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
52476bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
524816a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
52496bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
52506bce7ff8SHong Zhang     for (j=0; j<nz; j++){
52516bce7ff8SHong Zhang       *bj = aj[j]; bj++;
52526bce7ff8SHong Zhang     }
52536bce7ff8SHong Zhang     /* diag[i] */
52546bce7ff8SHong Zhang     *bj = i; bj++;
525516a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
52566bce7ff8SHong Zhang   }
52576bce7ff8SHong Zhang   PetscFunctionReturn(0);
52586bce7ff8SHong Zhang }
52596bce7ff8SHong Zhang 
526016a2bf60SHong Zhang #undef __FUNCT__
5261*35aa4fcfSShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct_v2"
5262*35aa4fcfSShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct_v2(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5263*35aa4fcfSShri Abhyankar {
5264*35aa4fcfSShri Abhyankar 
5265*35aa4fcfSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5266*35aa4fcfSShri Abhyankar   PetscErrorCode     ierr;
5267*35aa4fcfSShri Abhyankar   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
5268*35aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
5269*35aa4fcfSShri Abhyankar 
5270*35aa4fcfSShri Abhyankar   PetscFunctionBegin;
5271*35aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
5272*35aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
5273*35aa4fcfSShri Abhyankar 
5274*35aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
5275*35aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
5276*35aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5277*35aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
5278*35aa4fcfSShri Abhyankar   if (!b->diag){
5279*35aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
5280*35aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
5281*35aa4fcfSShri Abhyankar   }
5282*35aa4fcfSShri Abhyankar   bdiag = b->diag;
5283*35aa4fcfSShri Abhyankar 
5284*35aa4fcfSShri Abhyankar   if (n > 0) {
5285*35aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
5286*35aa4fcfSShri Abhyankar   }
5287*35aa4fcfSShri Abhyankar 
5288*35aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
5289*35aa4fcfSShri Abhyankar   bi = b->i;
5290*35aa4fcfSShri Abhyankar   bj = b->j;
5291*35aa4fcfSShri Abhyankar 
5292*35aa4fcfSShri Abhyankar   /* L part */
5293*35aa4fcfSShri Abhyankar   bi[0] = 0;
5294*35aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
5295*35aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
5296*35aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
5297*35aa4fcfSShri Abhyankar     aj = a->j + ai[i];
5298*35aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
5299*35aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
5300*35aa4fcfSShri Abhyankar     }
5301*35aa4fcfSShri Abhyankar   }
5302*35aa4fcfSShri Abhyankar 
5303*35aa4fcfSShri Abhyankar   /* U part */
5304*35aa4fcfSShri Abhyankar   bi_temp = bi[n];
5305*35aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
5306*35aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
5307*35aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
5308*35aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
5309*35aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
5310*35aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
5311*35aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
5312*35aa4fcfSShri Abhyankar     }
5313*35aa4fcfSShri Abhyankar     /* diag[i] */
5314*35aa4fcfSShri Abhyankar     *bj = i; bj++;
5315*35aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
5316*35aa4fcfSShri Abhyankar   }
5317*35aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
5318*35aa4fcfSShri Abhyankar }
5319*35aa4fcfSShri Abhyankar 
5320*35aa4fcfSShri Abhyankar #undef __FUNCT__
532116a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
532216a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
532316a2bf60SHong Zhang {
532416a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
532516a2bf60SHong Zhang   IS                 isicol;
532616a2bf60SHong Zhang   PetscErrorCode     ierr;
532716a2bf60SHong Zhang   const PetscInt     *r,*ic;
53287fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
532916a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
533016a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
533116a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
53327fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
533316a2bf60SHong Zhang   PetscReal          f;
533416a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
533516a2bf60SHong Zhang   PetscBT            lnkbt;
533616a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
533716a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
533816a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
533916a2bf60SHong Zhang   PetscTruth         missing;
53407fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
534116a2bf60SHong Zhang 
534216a2bf60SHong Zhang   PetscFunctionBegin;
534316a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
534416a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
534516a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
534616a2bf60SHong Zhang 
534716a2bf60SHong Zhang   f             = info->fill;
534816a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
534916a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
535016a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
535116a2bf60SHong Zhang 
535216a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
535316a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
53547fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
535516a2bf60SHong Zhang 
53567fa3a6a0SHong Zhang   if (!levels && both_identity) {
535716a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
535816a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
535916a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
53607fa3a6a0SHong Zhang     /* set MatSolve routines */
53617fa3a6a0SHong Zhang     switch (bs){
53627fa3a6a0SHong Zhang     case 2:
53637fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
53647fa3a6a0SHong Zhang       break;
53657fa3a6a0SHong Zhang     case 3:
53667fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
53677fa3a6a0SHong Zhang       break;
53687fa3a6a0SHong Zhang     case 4:
53697fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
53707fa3a6a0SHong Zhang       break;
53717fa3a6a0SHong Zhang     case 5:
53727fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
53737fa3a6a0SHong Zhang       break;
53747fa3a6a0SHong Zhang     case 6:
53757fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
53767fa3a6a0SHong Zhang       break;
53777fa3a6a0SHong Zhang     case 7:
53787fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
53797fa3a6a0SHong Zhang       break;
53807fa3a6a0SHong Zhang     default:
53817fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
53827fa3a6a0SHong Zhang       break;
53837fa3a6a0SHong Zhang     }
538416a2bf60SHong Zhang 
538516a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
538616a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
538716a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
538816a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
538916a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
539016a2bf60SHong Zhang     b->row           = isrow;
539116a2bf60SHong Zhang     b->col           = iscol;
539216a2bf60SHong Zhang     b->icol          = isicol;
539316a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
539416a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
539516a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5396b588c5a2SHong Zhang     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
539716a2bf60SHong Zhang     PetscFunctionReturn(0);
539816a2bf60SHong Zhang   }
539916a2bf60SHong Zhang 
540016a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
540116a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
540216a2bf60SHong Zhang 
540316a2bf60SHong Zhang   /* get new row pointers */
540416a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
540516a2bf60SHong Zhang   bi[0] = 0;
540616a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
540716a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
540816a2bf60SHong Zhang   bdiag[0]  = 0;
540916a2bf60SHong Zhang 
541016a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
541116a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
541216a2bf60SHong Zhang 
541316a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
541416a2bf60SHong Zhang   nlnk = n + 1;
541516a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
541616a2bf60SHong Zhang 
541716a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
541816a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
541916a2bf60SHong Zhang   current_space = free_space;
542016a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
542116a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
542216a2bf60SHong Zhang 
542316a2bf60SHong Zhang   for (i=0; i<n; i++) {
542416a2bf60SHong Zhang     nzi = 0;
542516a2bf60SHong Zhang     /* copy current row into linked list */
542616a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
542716a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
542816a2bf60SHong Zhang     cols = aj + ai[r[i]];
542916a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
543016a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
543116a2bf60SHong Zhang     nzi += nlnk;
543216a2bf60SHong Zhang 
543316a2bf60SHong Zhang     /* make sure diagonal entry is included */
543416a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
543516a2bf60SHong Zhang       fm = n;
543616a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
543716a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
543816a2bf60SHong Zhang       lnk[fm]    = i;
543916a2bf60SHong Zhang       lnk_lvl[i] = 0;
544016a2bf60SHong Zhang       nzi++; dcount++;
544116a2bf60SHong Zhang     }
544216a2bf60SHong Zhang 
544316a2bf60SHong Zhang     /* add pivot rows into the active row */
544416a2bf60SHong Zhang     nzbd = 0;
544516a2bf60SHong Zhang     prow = lnk[n];
544616a2bf60SHong Zhang     while (prow < i) {
544716a2bf60SHong Zhang       nnz      = bdiag[prow];
544816a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
544916a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
545016a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
545116a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
545216a2bf60SHong Zhang       nzi += nlnk;
545316a2bf60SHong Zhang       prow = lnk[prow];
545416a2bf60SHong Zhang       nzbd++;
545516a2bf60SHong Zhang     }
545616a2bf60SHong Zhang     bdiag[i] = nzbd;
545716a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
545816a2bf60SHong Zhang 
545916a2bf60SHong Zhang     /* if free space is not available, make more free space */
546016a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
546116a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
546216a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
546316a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
546416a2bf60SHong Zhang       reallocs++;
546516a2bf60SHong Zhang     }
546616a2bf60SHong Zhang 
546716a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
546816a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
546916a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
547016a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
547116a2bf60SHong Zhang 
547216a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
547316a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
547416a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
547516a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
547616a2bf60SHong Zhang     }
547716a2bf60SHong Zhang 
547816a2bf60SHong Zhang     current_space->array           += nzi;
547916a2bf60SHong Zhang     current_space->local_used      += nzi;
548016a2bf60SHong Zhang     current_space->local_remaining -= nzi;
548116a2bf60SHong Zhang     current_space_lvl->array           += nzi;
548216a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
548316a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
548416a2bf60SHong Zhang   }
548516a2bf60SHong Zhang 
548616a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
548716a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
548816a2bf60SHong Zhang 
548916a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
549016a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
549116a2bf60SHong Zhang 
549216a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5493783ef271SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
549416a2bf60SHong Zhang 
549516a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
549616a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
549716a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
549816a2bf60SHong Zhang 
549916a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
550016a2bf60SHong Zhang   {
550116a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
550216a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
550316a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
550416a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
550516a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
550616a2bf60SHong Zhang     if (diagonal_fill) {
550716a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
550816a2bf60SHong Zhang     }
550916a2bf60SHong Zhang   }
551016a2bf60SHong Zhang #endif
551116a2bf60SHong Zhang 
551216a2bf60SHong Zhang   /* put together the new matrix */
551316a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
551416a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
551516a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
551616a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
551716a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
551816a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
55197fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
552016a2bf60SHong Zhang   b->j          = bj;
552116a2bf60SHong Zhang   b->i          = bi;
552216a2bf60SHong Zhang   b->diag       = bdiag;
55237f53bb6cSHong Zhang   b->free_diag  = PETSC_TRUE;
552416a2bf60SHong Zhang   b->ilen       = 0;
552516a2bf60SHong Zhang   b->imax       = 0;
552616a2bf60SHong Zhang   b->row        = isrow;
552716a2bf60SHong Zhang   b->col        = iscol;
552816a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
552916a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
553016a2bf60SHong Zhang   b->icol       = isicol;
55317fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
553216a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
553316a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
55347fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
553516a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
553616a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
553716a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
553816a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
553916a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
55407fa3a6a0SHong Zhang   /* set MatSolve routines */
55417fa3a6a0SHong Zhang   if (both_identity){
55427fa3a6a0SHong Zhang     switch (bs){
55437fa3a6a0SHong Zhang     case 2:
55447fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
55457fa3a6a0SHong Zhang       break;
55467fa3a6a0SHong Zhang     case 3:
55477fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
55487fa3a6a0SHong Zhang       break;
55497fa3a6a0SHong Zhang     case 4:
55507fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
55517fa3a6a0SHong Zhang       break;
55527fa3a6a0SHong Zhang     case 5:
55537fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
55547fa3a6a0SHong Zhang       break;
55557fa3a6a0SHong Zhang     case 6:
55567fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
55577fa3a6a0SHong Zhang       break;
55587fa3a6a0SHong Zhang     case 7:
55597fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
55607fa3a6a0SHong Zhang       break;
55617fa3a6a0SHong Zhang     default:
55627fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
55637fa3a6a0SHong Zhang       break;
55647fa3a6a0SHong Zhang     }
55657fa3a6a0SHong Zhang   } else {
55667fa3a6a0SHong Zhang     switch (bs){
55677fa3a6a0SHong Zhang     case 2:
55687fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
55697fa3a6a0SHong Zhang       break;
55707fa3a6a0SHong Zhang     case 3:
55717fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
55727fa3a6a0SHong Zhang       break;
55737fa3a6a0SHong Zhang     case 4:
55747fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
55757fa3a6a0SHong Zhang       break;
55767fa3a6a0SHong Zhang     case 5:
55777fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
55787fa3a6a0SHong Zhang       break;
55797fa3a6a0SHong Zhang     case 6:
55807fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
55817fa3a6a0SHong Zhang       break;
55827fa3a6a0SHong Zhang     case 7:
55837fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
55847fa3a6a0SHong Zhang       break;
55857fa3a6a0SHong Zhang     default:
55867fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
55877fa3a6a0SHong Zhang       break;
55887fa3a6a0SHong Zhang     }
55897fa3a6a0SHong Zhang   }
559016a2bf60SHong Zhang   PetscFunctionReturn(0);
559116a2bf60SHong Zhang }
559216a2bf60SHong Zhang 
5593*35aa4fcfSShri Abhyankar #undef __FUNCT__
5594*35aa4fcfSShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct_v2"
5595*35aa4fcfSShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct_v2(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
5596*35aa4fcfSShri Abhyankar {
5597*35aa4fcfSShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
5598*35aa4fcfSShri Abhyankar   IS                 isicol;
5599*35aa4fcfSShri Abhyankar   PetscErrorCode     ierr;
5600*35aa4fcfSShri Abhyankar   const PetscInt     *r,*ic;
5601*35aa4fcfSShri Abhyankar   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
5602*35aa4fcfSShri Abhyankar   PetscInt           *bi,*cols,nnz,*cols_lvl;
5603*35aa4fcfSShri Abhyankar   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
5604*35aa4fcfSShri Abhyankar   PetscInt           i,levels,diagonal_fill;
5605*35aa4fcfSShri Abhyankar   PetscTruth         col_identity,row_identity,both_identity;
5606*35aa4fcfSShri Abhyankar   PetscReal          f;
5607*35aa4fcfSShri Abhyankar   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
5608*35aa4fcfSShri Abhyankar   PetscBT            lnkbt;
5609*35aa4fcfSShri Abhyankar   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
5610*35aa4fcfSShri Abhyankar   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
5611*35aa4fcfSShri Abhyankar   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
5612*35aa4fcfSShri Abhyankar   PetscTruth         missing;
5613*35aa4fcfSShri Abhyankar   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5614*35aa4fcfSShri Abhyankar 
5615*35aa4fcfSShri Abhyankar   PetscFunctionBegin;
5616*35aa4fcfSShri Abhyankar   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5617*35aa4fcfSShri Abhyankar   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5618*35aa4fcfSShri Abhyankar   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
5619*35aa4fcfSShri Abhyankar 
5620*35aa4fcfSShri Abhyankar   f             = info->fill;
5621*35aa4fcfSShri Abhyankar   levels        = (PetscInt)info->levels;
5622*35aa4fcfSShri Abhyankar   diagonal_fill = (PetscInt)info->diagonal_fill;
5623*35aa4fcfSShri Abhyankar   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
5624*35aa4fcfSShri Abhyankar 
5625*35aa4fcfSShri Abhyankar   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5626*35aa4fcfSShri Abhyankar   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
5627*35aa4fcfSShri Abhyankar   both_identity = (PetscTruth) (row_identity && col_identity);
5628*35aa4fcfSShri Abhyankar 
5629*35aa4fcfSShri Abhyankar   if (!levels && both_identity) {
5630*35aa4fcfSShri Abhyankar     /* special case: ilu(0) with natural ordering */
5631*35aa4fcfSShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct_v2(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5632*35aa4fcfSShri Abhyankar     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2;
5633*35aa4fcfSShri Abhyankar     /* set MatSolve routines */
5634*35aa4fcfSShri Abhyankar     switch (bs){
5635*35aa4fcfSShri Abhyankar     case 2:
5636*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2;
5637*35aa4fcfSShri Abhyankar       break;
5638*35aa4fcfSShri Abhyankar     case 3:
5639*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2;
5640*35aa4fcfSShri Abhyankar       break;
5641*35aa4fcfSShri Abhyankar     case 4:
5642*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2;
5643*35aa4fcfSShri Abhyankar       break;
5644*35aa4fcfSShri Abhyankar     case 5:
5645*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2;
5646*35aa4fcfSShri Abhyankar       break;
5647*35aa4fcfSShri Abhyankar     case 6:
5648*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2;
5649*35aa4fcfSShri Abhyankar       break;
5650*35aa4fcfSShri Abhyankar     case 7:
5651*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2;
5652*35aa4fcfSShri Abhyankar       break;
5653*35aa4fcfSShri Abhyankar     default:
5654*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
5655*35aa4fcfSShri Abhyankar       break;
5656*35aa4fcfSShri Abhyankar     }
5657*35aa4fcfSShri Abhyankar 
5658*35aa4fcfSShri Abhyankar     fact->factor = MAT_FACTOR_ILU;
5659*35aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
5660*35aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
5661*35aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
5662*35aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
5663*35aa4fcfSShri Abhyankar     b->row           = isrow;
5664*35aa4fcfSShri Abhyankar     b->col           = iscol;
5665*35aa4fcfSShri Abhyankar     b->icol          = isicol;
5666*35aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5667*35aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5668*35aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5669*35aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5670*35aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
5671*35aa4fcfSShri Abhyankar   }
5672*35aa4fcfSShri Abhyankar 
5673*35aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
5674*35aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5675*35aa4fcfSShri Abhyankar 
5676*35aa4fcfSShri Abhyankar   /* get new row pointers */
5677*35aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
5678*35aa4fcfSShri Abhyankar   bi[0] = 0;
5679*35aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
5680*35aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
5681*35aa4fcfSShri Abhyankar   bdiag[0]  = 0;
5682*35aa4fcfSShri Abhyankar 
5683*35aa4fcfSShri Abhyankar   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
5684*35aa4fcfSShri Abhyankar   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
5685*35aa4fcfSShri Abhyankar 
5686*35aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
5687*35aa4fcfSShri Abhyankar   nlnk = n + 1;
5688*35aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5689*35aa4fcfSShri Abhyankar 
5690*35aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
5691*35aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
5692*35aa4fcfSShri Abhyankar   current_space = free_space;
5693*35aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
5694*35aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
5695*35aa4fcfSShri Abhyankar 
5696*35aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
5697*35aa4fcfSShri Abhyankar     nzi = 0;
5698*35aa4fcfSShri Abhyankar     /* copy current row into linked list */
5699*35aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5700*35aa4fcfSShri Abhyankar     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
5701*35aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
5702*35aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
5703*35aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
5704*35aa4fcfSShri Abhyankar     nzi += nlnk;
5705*35aa4fcfSShri Abhyankar 
5706*35aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
5707*35aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
5708*35aa4fcfSShri Abhyankar       fm = n;
5709*35aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
5710*35aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
5711*35aa4fcfSShri Abhyankar       lnk[fm]    = i;
5712*35aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
5713*35aa4fcfSShri Abhyankar       nzi++; dcount++;
5714*35aa4fcfSShri Abhyankar     }
5715*35aa4fcfSShri Abhyankar 
5716*35aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
5717*35aa4fcfSShri Abhyankar     nzbd = 0;
5718*35aa4fcfSShri Abhyankar     prow = lnk[n];
5719*35aa4fcfSShri Abhyankar     while (prow < i) {
5720*35aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
5721*35aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
5722*35aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
5723*35aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
5724*35aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
5725*35aa4fcfSShri Abhyankar       nzi += nlnk;
5726*35aa4fcfSShri Abhyankar       prow = lnk[prow];
5727*35aa4fcfSShri Abhyankar       nzbd++;
5728*35aa4fcfSShri Abhyankar     }
5729*35aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
5730*35aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
5731*35aa4fcfSShri Abhyankar 
5732*35aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
5733*35aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
5734*35aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
5735*35aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
5736*35aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
5737*35aa4fcfSShri Abhyankar       reallocs++;
5738*35aa4fcfSShri Abhyankar     }
5739*35aa4fcfSShri Abhyankar 
5740*35aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
5741*35aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
5742*35aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
5743*35aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
5744*35aa4fcfSShri Abhyankar 
5745*35aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
5746*35aa4fcfSShri Abhyankar     if (*(bj_ptr[i]+bdiag[i]) != i) {
5747*35aa4fcfSShri Abhyankar       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
5748*35aa4fcfSShri Abhyankar     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
5749*35aa4fcfSShri Abhyankar     }
5750*35aa4fcfSShri Abhyankar 
5751*35aa4fcfSShri Abhyankar     current_space->array           += nzi;
5752*35aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
5753*35aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
5754*35aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
5755*35aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
5756*35aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
5757*35aa4fcfSShri Abhyankar   }
5758*35aa4fcfSShri Abhyankar 
5759*35aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
5760*35aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5761*35aa4fcfSShri Abhyankar 
5762*35aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
5763*35aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
5764*35aa4fcfSShri Abhyankar 
5765*35aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5766*35aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
5767*35aa4fcfSShri Abhyankar 
5768*35aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
5769*35aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5770*35aa4fcfSShri Abhyankar   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
5771*35aa4fcfSShri Abhyankar 
5772*35aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
5773*35aa4fcfSShri Abhyankar   {
5774*35aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
5775*35aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
5776*35aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5777*35aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
5778*35aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5779*35aa4fcfSShri Abhyankar     if (diagonal_fill) {
5780*35aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
5781*35aa4fcfSShri Abhyankar     }
5782*35aa4fcfSShri Abhyankar   }
5783*35aa4fcfSShri Abhyankar #endif
5784*35aa4fcfSShri Abhyankar 
5785*35aa4fcfSShri Abhyankar   /* put together the new matrix */
5786*35aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5787*35aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5788*35aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
5789*35aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
5790*35aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
5791*35aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
5792*35aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
5793*35aa4fcfSShri Abhyankar   b->j          = bj;
5794*35aa4fcfSShri Abhyankar   b->i          = bi;
5795*35aa4fcfSShri Abhyankar   b->diag       = bdiag;
5796*35aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
5797*35aa4fcfSShri Abhyankar   b->ilen       = 0;
5798*35aa4fcfSShri Abhyankar   b->imax       = 0;
5799*35aa4fcfSShri Abhyankar   b->row        = isrow;
5800*35aa4fcfSShri Abhyankar   b->col        = iscol;
5801*35aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5802*35aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5803*35aa4fcfSShri Abhyankar   b->icol       = isicol;
5804*35aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
5805*35aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
5806*35aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
5807*35aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
5808*35aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
5809*35aa4fcfSShri Abhyankar   (fact)->info.factor_mallocs    = reallocs;
5810*35aa4fcfSShri Abhyankar   (fact)->info.fill_ratio_given  = f;
5811*35aa4fcfSShri Abhyankar   (fact)->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5812*35aa4fcfSShri Abhyankar   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
5813*35aa4fcfSShri Abhyankar   /* set MatSolve routines */
5814*35aa4fcfSShri Abhyankar   if (both_identity){
5815*35aa4fcfSShri Abhyankar     switch (bs){
5816*35aa4fcfSShri Abhyankar     case 2:
5817*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2;
5818*35aa4fcfSShri Abhyankar       break;
5819*35aa4fcfSShri Abhyankar     case 3:
5820*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2;
5821*35aa4fcfSShri Abhyankar       break;
5822*35aa4fcfSShri Abhyankar     case 4:
5823*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2;
5824*35aa4fcfSShri Abhyankar       break;
5825*35aa4fcfSShri Abhyankar     case 5:
5826*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2;
5827*35aa4fcfSShri Abhyankar       break;
5828*35aa4fcfSShri Abhyankar     case 6:
5829*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2;
5830*35aa4fcfSShri Abhyankar       break;
5831*35aa4fcfSShri Abhyankar     case 7:
5832*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2;
5833*35aa4fcfSShri Abhyankar       break;
5834*35aa4fcfSShri Abhyankar     default:
5835*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct_v2;
5836*35aa4fcfSShri Abhyankar       break;
5837*35aa4fcfSShri Abhyankar     }
5838*35aa4fcfSShri Abhyankar   } else {
5839*35aa4fcfSShri Abhyankar     switch (bs){
5840*35aa4fcfSShri Abhyankar     case 2:
5841*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct_v2;
5842*35aa4fcfSShri Abhyankar       break;
5843*35aa4fcfSShri Abhyankar     case 3:
5844*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct_v2;
5845*35aa4fcfSShri Abhyankar       break;
5846*35aa4fcfSShri Abhyankar     case 4:
5847*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct_v2;
5848*35aa4fcfSShri Abhyankar       break;
5849*35aa4fcfSShri Abhyankar     case 5:
5850*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct_v2;
5851*35aa4fcfSShri Abhyankar       break;
5852*35aa4fcfSShri Abhyankar     case 6:
5853*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct_v2;
5854*35aa4fcfSShri Abhyankar       break;
5855*35aa4fcfSShri Abhyankar     case 7:
5856*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct_v2;
5857*35aa4fcfSShri Abhyankar       break;
5858*35aa4fcfSShri Abhyankar     default:
5859*35aa4fcfSShri Abhyankar       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct_v2;
5860*35aa4fcfSShri Abhyankar       break;
5861*35aa4fcfSShri Abhyankar     }
5862*35aa4fcfSShri Abhyankar   }
5863*35aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
5864*35aa4fcfSShri Abhyankar }
5865*35aa4fcfSShri Abhyankar 
5866*35aa4fcfSShri Abhyankar 
58674e2b4712SSatish Balay /*
58684e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
58694e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
58704e2b4712SSatish Balay    Not a good example of code reuse.
58714e2b4712SSatish Balay */
58724a2ae208SSatish Balay #undef __FUNCT__
58734a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
58740481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
58754e2b4712SSatish Balay {
58764e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
58774e2b4712SSatish Balay   IS             isicol;
58786849ba73SBarry Smith   PetscErrorCode ierr;
58795d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
58805d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5881a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5882d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
588341df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5884329f5518SBarry Smith   PetscReal      f;
5885*35aa4fcfSShri Abhyankar   PetscTruth     newdatastruct=PETSC_FALSE,newdatastruct_v2=PETSC_FALSE;
58864e2b4712SSatish Balay 
58874e2b4712SSatish Balay   PetscFunctionBegin;
588816a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
588916a2bf60SHong Zhang   if (newdatastruct){
589016a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
589116a2bf60SHong Zhang     PetscFunctionReturn(0);
589216a2bf60SHong Zhang   }
5893*35aa4fcfSShri Abhyankar   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new_v2",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5894*35aa4fcfSShri Abhyankar   if (newdatastruct_v2){
5895*35aa4fcfSShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct_v2(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5896*35aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
5897*35aa4fcfSShri Abhyankar   }
589816a2bf60SHong Zhang 
58996bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
59006bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
59016bce7ff8SHong Zhang 
5902435faa5fSBarry Smith   f             = info->fill;
5903690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5904690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
59054c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
590616a2bf60SHong Zhang 
5907667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5908667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
59097d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5910309c388cSBarry Smith 
591141df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
591216a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
59136bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
59146bce7ff8SHong Zhang 
5915719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5916719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
5917bb3d539aSBarry Smith     b->row       = isrow;
5918bb3d539aSBarry Smith     b->col       = iscol;
5919bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5920bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5921bb3d539aSBarry Smith     b->icol      = isicol;
5922bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5923b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
59246bce7ff8SHong Zhang     PetscFunctionReturn(0);
59256bce7ff8SHong Zhang   }
59266bce7ff8SHong Zhang 
59276bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
59284e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
59294e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
59304e2b4712SSatish Balay 
59314e2b4712SSatish Balay     /* get new row pointers */
5932690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
59334e2b4712SSatish Balay     ainew[0] = 0;
59344e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5935690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5936690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
59374e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5938690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
59394e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5940690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
59414e2b4712SSatish Balay     /* im is level for each filled value */
5942690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
59434e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5944690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
59454e2b4712SSatish Balay     dloc[0]  = 0;
59464e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5947435faa5fSBarry Smith 
5948435faa5fSBarry Smith       /* copy prow into linked list */
59494e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
59503b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
59514e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
59524e2b4712SSatish Balay       fill[n]    = n;
5953435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
59544e2b4712SSatish Balay       while (nz--) {
59554e2b4712SSatish Balay 	fm  = n;
59564e2b4712SSatish Balay 	idx = ic[*xi++];
59574e2b4712SSatish Balay 	do {
59584e2b4712SSatish Balay 	  m  = fm;
59594e2b4712SSatish Balay 	  fm = fill[m];
59604e2b4712SSatish Balay 	} while (fm < idx);
59614e2b4712SSatish Balay 	fill[m]   = idx;
59624e2b4712SSatish Balay 	fill[idx] = fm;
59634e2b4712SSatish Balay 	im[idx]   = 0;
59644e2b4712SSatish Balay       }
5965435faa5fSBarry Smith 
5966435faa5fSBarry Smith       /* make sure diagonal entry is included */
5967435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5968435faa5fSBarry Smith 	fm = n;
5969435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5970435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5971435faa5fSBarry Smith 	fill[fm]   = prow;
5972435faa5fSBarry Smith 	im[prow]   = 0;
5973435faa5fSBarry Smith 	nzf++;
5974335d9088SBarry Smith 	dcount++;
5975435faa5fSBarry Smith       }
5976435faa5fSBarry Smith 
59774e2b4712SSatish Balay       nzi = 0;
59784e2b4712SSatish Balay       row = fill[n];
59794e2b4712SSatish Balay       while (row < prow) {
59804e2b4712SSatish Balay 	incrlev = im[row] + 1;
59814e2b4712SSatish Balay 	nz      = dloc[row];
5982435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
59834e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
59844e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
59854e2b4712SSatish Balay 	fm      = row;
59864e2b4712SSatish Balay 	while (nnz-- > 0) {
59874e2b4712SSatish Balay 	  idx = *xi++;
59884e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
59894e2b4712SSatish Balay 	    flev++;
59904e2b4712SSatish Balay 	    continue;
59914e2b4712SSatish Balay 	  }
59924e2b4712SSatish Balay 	  do {
59934e2b4712SSatish Balay 	    m  = fm;
59944e2b4712SSatish Balay 	    fm = fill[m];
59954e2b4712SSatish Balay 	  } while (fm < idx);
59964e2b4712SSatish Balay 	  if (fm != idx) {
59974e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
59984e2b4712SSatish Balay 	    fill[m]   = idx;
59994e2b4712SSatish Balay 	    fill[idx] = fm;
60004e2b4712SSatish Balay 	    fm        = idx;
60014e2b4712SSatish Balay 	    nzf++;
6002ecf371e4SBarry Smith 	  } else {
60034e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
60044e2b4712SSatish Balay 	  }
60054e2b4712SSatish Balay 	  flev++;
60064e2b4712SSatish Balay 	}
60074e2b4712SSatish Balay 	row = fill[row];
60084e2b4712SSatish Balay 	nzi++;
60094e2b4712SSatish Balay       }
60104e2b4712SSatish Balay       /* copy new filled row into permanent storage */
60114e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
60124e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6013ecf371e4SBarry Smith 
6014ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
6015ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6016ecf371e4SBarry Smith 	/* just double the memory each time */
6017690b6cddSBarry Smith 	PetscInt maxadd = jmax;
6018ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
60194e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
60204e2b4712SSatish Balay 	jmax += maxadd;
6021ecf371e4SBarry Smith 
6022ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
60235d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
60245d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6025606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
60265d0c19d7SBarry Smith 	ajnew = xitmp;
60275d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
60285d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6029606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
60305d0c19d7SBarry Smith 	ajfill = xitmp;
6031eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
60324e2b4712SSatish Balay       }
60335d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
60344e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
60354e2b4712SSatish Balay       dloc[prow]  = nzi;
60364e2b4712SSatish Balay       fm          = fill[n];
60374e2b4712SSatish Balay       while (nzf--) {
60385d0c19d7SBarry Smith 	*xitmp++ = fm;
60394e2b4712SSatish Balay 	*flev++ = im[fm];
60404e2b4712SSatish Balay 	fm      = fill[fm];
60414e2b4712SSatish Balay       }
6042435faa5fSBarry Smith       /* make sure row has diagonal entry */
6043435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
604477431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
60452401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6046435faa5fSBarry Smith       }
60474e2b4712SSatish Balay     }
6048606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
60494e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
60504e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6051606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6052606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
60534e2b4712SSatish Balay 
60546cf91177SBarry Smith #if defined(PETSC_USE_INFO)
60554e2b4712SSatish Balay     {
6056329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6057ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6058ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6059ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6060ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6061335d9088SBarry Smith       if (diagonal_fill) {
6062ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6063335d9088SBarry Smith       }
60644e2b4712SSatish Balay     }
606563ba0a88SBarry Smith #endif
60664e2b4712SSatish Balay 
60674e2b4712SSatish Balay     /* put together the new matrix */
6068719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6069719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6070719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
6071e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6072e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
60737c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6074a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
60754e2b4712SSatish Balay     b->j          = ajnew;
60764e2b4712SSatish Balay     b->i          = ainew;
60774e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
60784e2b4712SSatish Balay     b->diag       = dloc;
60797f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
60804e2b4712SSatish Balay     b->ilen       = 0;
60814e2b4712SSatish Balay     b->imax       = 0;
60824e2b4712SSatish Balay     b->row        = isrow;
60834e2b4712SSatish Balay     b->col        = iscol;
6084bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6085c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6086c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6087e51c0b9cSSatish Balay     b->icol       = isicol;
608887828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
60894e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
60904e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6091719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
60924e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
60934e2b4712SSatish Balay 
6094719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
6095719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
6096719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
60976bce7ff8SHong Zhang 
609841df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
60998661488fSKris Buschelman   PetscFunctionReturn(0);
61008661488fSKris Buschelman }
61018661488fSKris Buschelman 
6102732ee342SKris Buschelman #undef __FUNCT__
61037e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6104dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
61057e7071cdSKris Buschelman {
610612272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
610712272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
61085a9542e3SKris Buschelman   PetscFunctionBegin;
61097cf1b8d3SKris Buschelman   /* Undo Column scaling */
61107cf1b8d3SKris Buschelman /*    while (nz--) { */
61117cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
61127cf1b8d3SKris Buschelman /*    } */
6113c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6114c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
61157cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
61167cf1b8d3SKris Buschelman }
61177cf1b8d3SKris Buschelman 
61187cf1b8d3SKris Buschelman #undef __FUNCT__
61197cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6120dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
61217cf1b8d3SKris Buschelman {
61227cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6123b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
61242aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
61255a9542e3SKris Buschelman   PetscFunctionBegin;
61260b9da03eSKris Buschelman   /* Is this really necessary? */
612720235379SKris Buschelman   while (nz--) {
61280b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
61297e7071cdSKris Buschelman   }
6130c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
61317e7071cdSKris Buschelman   PetscFunctionReturn(0);
61327e7071cdSKris Buschelman }
61337e7071cdSKris Buschelman 
6134732ee342SKris Buschelman 
6135