xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 78bb40077513d5120f4c52e0fb25a84efb280004)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
/*
    Factorization code for BAIJ format.
*/
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
119dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
120f1af5d2fSBarry Smith {
121f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122dfbe8321SBarry Smith   PetscErrorCode ierr;
123690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
124690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
125f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12787828ca2SBarry Smith   PetscScalar    *x,*b;
128f1af5d2fSBarry Smith 
129f1af5d2fSBarry Smith   PetscFunctionBegin;
130ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
133f1af5d2fSBarry Smith 
134f1af5d2fSBarry Smith   /* forward solve the U^T */
135f1af5d2fSBarry Smith   idx = 0;
136f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
137f1af5d2fSBarry Smith 
138f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
139f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
140ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
141f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
142f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
143f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
144f1af5d2fSBarry Smith     v += 9;
145f1af5d2fSBarry Smith 
146f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
147f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
148f1af5d2fSBarry Smith     while (nz--) {
149f1af5d2fSBarry Smith       oidx = 3*(*vi++);
150f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
151f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
152f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
153f1af5d2fSBarry Smith       v  += 9;
154f1af5d2fSBarry Smith     }
155f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
156f1af5d2fSBarry Smith     idx += 3;
157f1af5d2fSBarry Smith   }
158f1af5d2fSBarry Smith   /* backward solve the L^T */
159f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
160f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
161f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
162f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
163f1af5d2fSBarry Smith     idt  = 3*i;
164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
165f1af5d2fSBarry Smith     while (nz--) {
166f1af5d2fSBarry Smith       idx   = 3*(*vi--);
167f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
168f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
169f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
170f1af5d2fSBarry Smith       v -= 9;
171f1af5d2fSBarry Smith     }
172f1af5d2fSBarry Smith   }
1731ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
175dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
176f1af5d2fSBarry Smith   PetscFunctionReturn(0);
177f1af5d2fSBarry Smith }
178f1af5d2fSBarry Smith 
1794a2ae208SSatish Balay #undef __FUNCT__
1804a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
181dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
182f1af5d2fSBarry Smith {
183f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
184dfbe8321SBarry Smith   PetscErrorCode ierr;
185690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
186690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
187f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18987828ca2SBarry Smith   PetscScalar    *x,*b;
190f1af5d2fSBarry Smith 
191f1af5d2fSBarry Smith   PetscFunctionBegin;
192ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1931ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
195f1af5d2fSBarry Smith 
196f1af5d2fSBarry Smith   /* forward solve the U^T */
197f1af5d2fSBarry Smith   idx = 0;
198f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
199f1af5d2fSBarry Smith 
200f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
201f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
202ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
203f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
204f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
205f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
206f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
207f1af5d2fSBarry Smith     v += 16;
208f1af5d2fSBarry Smith 
209f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
210f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
211f1af5d2fSBarry Smith     while (nz--) {
212f1af5d2fSBarry Smith       oidx = 4*(*vi++);
213f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
214f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
215f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
216f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
217f1af5d2fSBarry Smith       v  += 16;
218f1af5d2fSBarry Smith     }
219f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
220f1af5d2fSBarry Smith     idx += 4;
221f1af5d2fSBarry Smith   }
222f1af5d2fSBarry Smith   /* backward solve the L^T */
223f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
224f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
225f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
226f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
227f1af5d2fSBarry Smith     idt  = 4*i;
228f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
229f1af5d2fSBarry Smith     while (nz--) {
230f1af5d2fSBarry Smith       idx   = 4*(*vi--);
231f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
232f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
233f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
234f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
235f1af5d2fSBarry Smith       v -= 16;
236f1af5d2fSBarry Smith     }
237f1af5d2fSBarry Smith   }
2381ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2391ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
240dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
241f1af5d2fSBarry Smith   PetscFunctionReturn(0);
242f1af5d2fSBarry Smith }
243f1af5d2fSBarry Smith 
2444a2ae208SSatish Balay #undef __FUNCT__
2454a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
246dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
247f1af5d2fSBarry Smith {
248f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
249dfbe8321SBarry Smith   PetscErrorCode ierr;
250690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
251690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
252f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25487828ca2SBarry Smith   PetscScalar    *x,*b;
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith   PetscFunctionBegin;
257ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2581ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2591ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260f1af5d2fSBarry Smith 
261f1af5d2fSBarry Smith   /* forward solve the U^T */
262f1af5d2fSBarry Smith   idx = 0;
263f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
266f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
267ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
268f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
269f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
270f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
271f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
272f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
273f1af5d2fSBarry Smith     v += 25;
274f1af5d2fSBarry Smith 
275f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
276f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
277f1af5d2fSBarry Smith     while (nz--) {
278f1af5d2fSBarry Smith       oidx = 5*(*vi++);
279f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
280f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
281f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
282f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
283f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
284f1af5d2fSBarry Smith       v  += 25;
285f1af5d2fSBarry Smith     }
286f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
287f1af5d2fSBarry Smith     idx += 5;
288f1af5d2fSBarry Smith   }
289f1af5d2fSBarry Smith   /* backward solve the L^T */
290f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
291f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
292f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
293f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
294f1af5d2fSBarry Smith     idt  = 5*i;
295f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
296f1af5d2fSBarry Smith     while (nz--) {
297f1af5d2fSBarry Smith       idx   = 5*(*vi--);
298f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
299f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
300f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
301f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
302f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
303f1af5d2fSBarry Smith       v -= 25;
304f1af5d2fSBarry Smith     }
305f1af5d2fSBarry Smith   }
3061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
308dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
309f1af5d2fSBarry Smith   PetscFunctionReturn(0);
310f1af5d2fSBarry Smith }
311f1af5d2fSBarry Smith 
3124a2ae208SSatish Balay #undef __FUNCT__
3134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
314dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315f1af5d2fSBarry Smith {
316f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
317dfbe8321SBarry Smith   PetscErrorCode ierr;
318690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
319690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
320f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
32187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32287828ca2SBarry Smith   PetscScalar    *x,*b;
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith   PetscFunctionBegin;
325ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
328f1af5d2fSBarry Smith 
329f1af5d2fSBarry Smith   /* forward solve the U^T */
330f1af5d2fSBarry Smith   idx = 0;
331f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
332f1af5d2fSBarry Smith 
333f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
334f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
335ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
336ef66eb69SBarry Smith     x6    = x[5+idx];
337f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
338f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
339f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
340f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
341f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
342f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
343f1af5d2fSBarry Smith     v += 36;
344f1af5d2fSBarry Smith 
345f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
346f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
347f1af5d2fSBarry Smith     while (nz--) {
348f1af5d2fSBarry Smith       oidx = 6*(*vi++);
349f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
350f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
351f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
352f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
353f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
354f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
355f1af5d2fSBarry Smith       v  += 36;
356f1af5d2fSBarry Smith     }
357f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
358f1af5d2fSBarry Smith     x[5+idx] = s6;
359f1af5d2fSBarry Smith     idx += 6;
360f1af5d2fSBarry Smith   }
361f1af5d2fSBarry Smith   /* backward solve the L^T */
362f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
363f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
364f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
365f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
366f1af5d2fSBarry Smith     idt  = 6*i;
367f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
368f1af5d2fSBarry Smith     s6 = x[5+idt];
369f1af5d2fSBarry Smith     while (nz--) {
370f1af5d2fSBarry Smith       idx   = 6*(*vi--);
371f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
372f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
373f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
374f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
375f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
376f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
377f1af5d2fSBarry Smith       v -= 36;
378f1af5d2fSBarry Smith     }
379f1af5d2fSBarry Smith   }
3801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
382dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
383f1af5d2fSBarry Smith   PetscFunctionReturn(0);
384f1af5d2fSBarry Smith }
385f1af5d2fSBarry Smith 
3864a2ae208SSatish Balay #undef __FUNCT__
3874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
388dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
389f1af5d2fSBarry Smith {
390f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
391dfbe8321SBarry Smith   PetscErrorCode ierr;
392690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
393690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
394f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39687828ca2SBarry Smith   PetscScalar    *x,*b;
397f1af5d2fSBarry Smith 
398f1af5d2fSBarry Smith   PetscFunctionBegin;
399ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4001ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
402f1af5d2fSBarry Smith 
403f1af5d2fSBarry Smith   /* forward solve the U^T */
404f1af5d2fSBarry Smith   idx = 0;
405f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
406f1af5d2fSBarry Smith 
407f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
408f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
409ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
410ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
411f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
412f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
413f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
414f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
415f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
416f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
417f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
418f1af5d2fSBarry Smith     v += 49;
419f1af5d2fSBarry Smith 
420f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
421f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
422f1af5d2fSBarry Smith     while (nz--) {
423f1af5d2fSBarry Smith       oidx = 7*(*vi++);
424f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
425f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
426f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
427f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
428f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
429f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
430f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
431f1af5d2fSBarry Smith       v  += 49;
432f1af5d2fSBarry Smith     }
433f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
434f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
435f1af5d2fSBarry Smith     idx += 7;
436f1af5d2fSBarry Smith   }
437f1af5d2fSBarry Smith   /* backward solve the L^T */
438f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
439f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
440f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
441f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
442f1af5d2fSBarry Smith     idt  = 7*i;
443f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
444f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
445f1af5d2fSBarry Smith     while (nz--) {
446f1af5d2fSBarry Smith       idx   = 7*(*vi--);
447f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
448f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
449f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
450f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
451f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
452f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
453f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
454f1af5d2fSBarry Smith       v -= 49;
455f1af5d2fSBarry Smith     }
456f1af5d2fSBarry Smith   }
4571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
459dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
460f1af5d2fSBarry Smith   PetscFunctionReturn(0);
461f1af5d2fSBarry Smith }
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4644a2ae208SSatish Balay #undef __FUNCT__
4654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
466dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
467f1af5d2fSBarry Smith {
468f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
469f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4706849ba73SBarry Smith   PetscErrorCode ierr;
4715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
473690b6cddSBarry Smith   PetscInt       *diag = a->diag;
474f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47587828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
476f1af5d2fSBarry Smith 
477f1af5d2fSBarry Smith   PetscFunctionBegin;
4781ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
480f1af5d2fSBarry Smith   t  = a->solve_work;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
483f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
484f1af5d2fSBarry Smith 
485f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
486f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
487f1af5d2fSBarry Smith     t[i] = b[c[i]];
488f1af5d2fSBarry Smith   }
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith   /* forward solve the U^T */
491f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
492f1af5d2fSBarry Smith 
493f1af5d2fSBarry Smith     v     = aa + diag[i];
494f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
495f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
498f1af5d2fSBarry Smith     while (nz--) {
499f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
500f1af5d2fSBarry Smith     }
501f1af5d2fSBarry Smith     t[i]   = s1;
502f1af5d2fSBarry Smith   }
503f1af5d2fSBarry Smith   /* backward solve the L^T */
504f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
505f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
506f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
507f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
508f1af5d2fSBarry Smith     s1   = t[i];
509f1af5d2fSBarry Smith     while (nz--) {
510f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
513f1af5d2fSBarry Smith 
514f1af5d2fSBarry Smith   /* copy t into x according to permutation */
515f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
516f1af5d2fSBarry Smith     x[r[i]]   = t[i];
517f1af5d2fSBarry Smith   }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
520f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5211ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5221ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
523dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
524f1af5d2fSBarry Smith   PetscFunctionReturn(0);
525f1af5d2fSBarry Smith }
526f1af5d2fSBarry Smith 
5274a2ae208SSatish Balay #undef __FUNCT__
5284a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
529dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
530f1af5d2fSBarry Smith {
531f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
532f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5336849ba73SBarry Smith   PetscErrorCode ierr;
5345d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
536690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
537f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53887828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   PetscFunctionBegin;
5421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
544f1af5d2fSBarry Smith   t  = a->solve_work;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
547f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
550f1af5d2fSBarry Smith   ii = 0;
551f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
552f1af5d2fSBarry Smith     ic      = 2*c[i];
553f1af5d2fSBarry Smith     t[ii]   = b[ic];
554f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
555f1af5d2fSBarry Smith     ii += 2;
556f1af5d2fSBarry Smith   }
557f1af5d2fSBarry Smith 
558f1af5d2fSBarry Smith   /* forward solve the U^T */
559f1af5d2fSBarry Smith   idx = 0;
560f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
561f1af5d2fSBarry Smith 
562f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
563f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
564f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
565f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
566f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
567f1af5d2fSBarry Smith     v += 4;
568f1af5d2fSBarry Smith 
569f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
570f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
571f1af5d2fSBarry Smith     while (nz--) {
572f1af5d2fSBarry Smith       oidx = 2*(*vi++);
573f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
574f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
575f1af5d2fSBarry Smith       v  += 4;
576f1af5d2fSBarry Smith     }
577f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
578f1af5d2fSBarry Smith     idx += 2;
579f1af5d2fSBarry Smith   }
580f1af5d2fSBarry Smith   /* backward solve the L^T */
581f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
582f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
583f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
584f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
585f1af5d2fSBarry Smith     idt  = 2*i;
586f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
587f1af5d2fSBarry Smith     while (nz--) {
588f1af5d2fSBarry Smith       idx   = 2*(*vi--);
589f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
590f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
591f1af5d2fSBarry Smith       v -= 4;
592f1af5d2fSBarry Smith     }
593f1af5d2fSBarry Smith   }
594f1af5d2fSBarry Smith 
595f1af5d2fSBarry Smith   /* copy t into x according to permutation */
596f1af5d2fSBarry Smith   ii = 0;
597f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
598f1af5d2fSBarry Smith     ir      = 2*r[i];
599f1af5d2fSBarry Smith     x[ir]   = t[ii];
600f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
601f1af5d2fSBarry Smith     ii += 2;
602f1af5d2fSBarry Smith   }
603f1af5d2fSBarry Smith 
604f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
605f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
608dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
609f1af5d2fSBarry Smith   PetscFunctionReturn(0);
610f1af5d2fSBarry Smith }
611f1af5d2fSBarry Smith 
6124a2ae208SSatish Balay #undef __FUNCT__
6134a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
614dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
615f1af5d2fSBarry Smith {
616f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
617f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6186849ba73SBarry Smith   PetscErrorCode ierr;
6195d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
621690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
622f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
625f1af5d2fSBarry Smith 
626f1af5d2fSBarry Smith   PetscFunctionBegin;
6271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
629f1af5d2fSBarry Smith   t  = a->solve_work;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
632f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
633f1af5d2fSBarry Smith 
634f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
635f1af5d2fSBarry Smith   ii = 0;
636f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
637f1af5d2fSBarry Smith     ic      = 3*c[i];
638f1af5d2fSBarry Smith     t[ii]   = b[ic];
639f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
640f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
641f1af5d2fSBarry Smith     ii += 3;
642f1af5d2fSBarry Smith   }
643f1af5d2fSBarry Smith 
644f1af5d2fSBarry Smith   /* forward solve the U^T */
645f1af5d2fSBarry Smith   idx = 0;
646f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
647f1af5d2fSBarry Smith 
648f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
649f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
650f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
651f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
652f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
653f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
654f1af5d2fSBarry Smith     v += 9;
655f1af5d2fSBarry Smith 
656f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
657f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
658f1af5d2fSBarry Smith     while (nz--) {
659f1af5d2fSBarry Smith       oidx = 3*(*vi++);
660f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
661f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
662f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
663f1af5d2fSBarry Smith       v  += 9;
664f1af5d2fSBarry Smith     }
665f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
666f1af5d2fSBarry Smith     idx += 3;
667f1af5d2fSBarry Smith   }
668f1af5d2fSBarry Smith   /* backward solve the L^T */
669f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
670f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
671f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
672f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
673f1af5d2fSBarry Smith     idt  = 3*i;
674f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
675f1af5d2fSBarry Smith     while (nz--) {
676f1af5d2fSBarry Smith       idx   = 3*(*vi--);
677f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
678f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
679f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
680f1af5d2fSBarry Smith       v -= 9;
681f1af5d2fSBarry Smith     }
682f1af5d2fSBarry Smith   }
683f1af5d2fSBarry Smith 
684f1af5d2fSBarry Smith   /* copy t into x according to permutation */
685f1af5d2fSBarry Smith   ii = 0;
686f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
687f1af5d2fSBarry Smith     ir      = 3*r[i];
688f1af5d2fSBarry Smith     x[ir]   = t[ii];
689f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
690f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
691f1af5d2fSBarry Smith     ii += 3;
692f1af5d2fSBarry Smith   }
693f1af5d2fSBarry Smith 
694f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
695f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
698dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
699f1af5d2fSBarry Smith   PetscFunctionReturn(0);
700f1af5d2fSBarry Smith }
701f1af5d2fSBarry Smith 
7024a2ae208SSatish Balay #undef __FUNCT__
7034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
704dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
705f1af5d2fSBarry Smith {
706f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
707f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7086849ba73SBarry Smith   PetscErrorCode ierr;
7095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
711690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
712f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
715f1af5d2fSBarry Smith 
716f1af5d2fSBarry Smith   PetscFunctionBegin;
7171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
719f1af5d2fSBarry Smith   t  = a->solve_work;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
722f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
723f1af5d2fSBarry Smith 
724f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
725f1af5d2fSBarry Smith   ii = 0;
726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
727f1af5d2fSBarry Smith     ic      = 4*c[i];
728f1af5d2fSBarry Smith     t[ii]   = b[ic];
729f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
730f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
731f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
732f1af5d2fSBarry Smith     ii += 4;
733f1af5d2fSBarry Smith   }
734f1af5d2fSBarry Smith 
735f1af5d2fSBarry Smith   /* forward solve the U^T */
736f1af5d2fSBarry Smith   idx = 0;
737f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
738f1af5d2fSBarry Smith 
739f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
740f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
741f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
742f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
743f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
744f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
745f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
746f1af5d2fSBarry Smith     v += 16;
747f1af5d2fSBarry Smith 
748f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
749f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
750f1af5d2fSBarry Smith     while (nz--) {
751f1af5d2fSBarry Smith       oidx = 4*(*vi++);
752f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
753f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
754f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
755f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
756f1af5d2fSBarry Smith       v  += 16;
757f1af5d2fSBarry Smith     }
758f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
759f1af5d2fSBarry Smith     idx += 4;
760f1af5d2fSBarry Smith   }
761f1af5d2fSBarry Smith   /* backward solve the L^T */
762f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
763f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
764f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
765f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
766f1af5d2fSBarry Smith     idt  = 4*i;
767f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
768f1af5d2fSBarry Smith     while (nz--) {
769f1af5d2fSBarry Smith       idx   = 4*(*vi--);
770f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
771f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
772f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
773f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
774f1af5d2fSBarry Smith       v -= 16;
775f1af5d2fSBarry Smith     }
776f1af5d2fSBarry Smith   }
777f1af5d2fSBarry Smith 
778f1af5d2fSBarry Smith   /* copy t into x according to permutation */
779f1af5d2fSBarry Smith   ii = 0;
780f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
781f1af5d2fSBarry Smith     ir      = 4*r[i];
782f1af5d2fSBarry Smith     x[ir]   = t[ii];
783f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
784f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
785f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
786f1af5d2fSBarry Smith     ii += 4;
787f1af5d2fSBarry Smith   }
788f1af5d2fSBarry Smith 
789f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
790f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
794f1af5d2fSBarry Smith   PetscFunctionReturn(0);
795f1af5d2fSBarry Smith }
796f1af5d2fSBarry Smith 
7974a2ae208SSatish Balay #undef __FUNCT__
7984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
799dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
800f1af5d2fSBarry Smith {
801f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
802f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8036849ba73SBarry Smith   PetscErrorCode ierr;
8045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
806690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
807f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
810f1af5d2fSBarry Smith 
811f1af5d2fSBarry Smith   PetscFunctionBegin;
8121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
814f1af5d2fSBarry Smith   t  = a->solve_work;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
817f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
818f1af5d2fSBarry Smith 
819f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
820f1af5d2fSBarry Smith   ii = 0;
821f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
822f1af5d2fSBarry Smith     ic      = 5*c[i];
823f1af5d2fSBarry Smith     t[ii]   = b[ic];
824f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
825f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
826f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
827f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
828f1af5d2fSBarry Smith     ii += 5;
829f1af5d2fSBarry Smith   }
830f1af5d2fSBarry Smith 
831f1af5d2fSBarry Smith   /* forward solve the U^T */
832f1af5d2fSBarry Smith   idx = 0;
833f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
834f1af5d2fSBarry Smith 
835f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
836f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
837f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
838f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
839f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
840f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
841f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
842f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
843f1af5d2fSBarry Smith     v += 25;
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
846f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
847f1af5d2fSBarry Smith     while (nz--) {
848f1af5d2fSBarry Smith       oidx = 5*(*vi++);
849f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
850f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
851f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
852f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
853f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
854f1af5d2fSBarry Smith       v  += 25;
855f1af5d2fSBarry Smith     }
856f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
857f1af5d2fSBarry Smith     idx += 5;
858f1af5d2fSBarry Smith   }
859f1af5d2fSBarry Smith   /* backward solve the L^T */
860f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
861f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
862f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
863f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
864f1af5d2fSBarry Smith     idt  = 5*i;
865f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
866f1af5d2fSBarry Smith     while (nz--) {
867f1af5d2fSBarry Smith       idx   = 5*(*vi--);
868f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
869f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
870f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
871f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
872f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
873f1af5d2fSBarry Smith       v -= 25;
874f1af5d2fSBarry Smith     }
875f1af5d2fSBarry Smith   }
876f1af5d2fSBarry Smith 
877f1af5d2fSBarry Smith   /* copy t into x according to permutation */
878f1af5d2fSBarry Smith   ii = 0;
879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
880f1af5d2fSBarry Smith     ir      = 5*r[i];
881f1af5d2fSBarry Smith     x[ir]   = t[ii];
882f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
883f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
884f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
885f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
886f1af5d2fSBarry Smith     ii += 5;
887f1af5d2fSBarry Smith   }
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
890f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8911ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
893dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
894f1af5d2fSBarry Smith   PetscFunctionReturn(0);
895f1af5d2fSBarry Smith }
896f1af5d2fSBarry Smith 
8974a2ae208SSatish Balay #undef __FUNCT__
8984a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
899dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
900f1af5d2fSBarry Smith {
901f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
902f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9036849ba73SBarry Smith   PetscErrorCode ierr;
9045d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9055d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
906690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
907f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
910f1af5d2fSBarry Smith 
911f1af5d2fSBarry Smith   PetscFunctionBegin;
9121ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
914f1af5d2fSBarry Smith   t  = a->solve_work;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
917f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
920f1af5d2fSBarry Smith   ii = 0;
921f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
922f1af5d2fSBarry Smith     ic      = 6*c[i];
923f1af5d2fSBarry Smith     t[ii]   = b[ic];
924f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
925f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
926f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
927f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
928f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
929f1af5d2fSBarry Smith     ii += 6;
930f1af5d2fSBarry Smith   }
931f1af5d2fSBarry Smith 
932f1af5d2fSBarry Smith   /* forward solve the U^T */
933f1af5d2fSBarry Smith   idx = 0;
934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
937f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
938f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
939f1af5d2fSBarry Smith     x6    = t[5+idx];
940f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
941f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
942f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
943f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
944f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
945f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
946f1af5d2fSBarry Smith     v += 36;
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
949f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
950f1af5d2fSBarry Smith     while (nz--) {
951f1af5d2fSBarry Smith       oidx = 6*(*vi++);
952f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
953f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
954f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
955f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
956f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
957f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
958f1af5d2fSBarry Smith       v  += 36;
959f1af5d2fSBarry Smith     }
960f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
961f1af5d2fSBarry Smith     t[5+idx] = s6;
962f1af5d2fSBarry Smith     idx += 6;
963f1af5d2fSBarry Smith   }
964f1af5d2fSBarry Smith   /* backward solve the L^T */
965f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
966f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
967f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
968f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
969f1af5d2fSBarry Smith     idt  = 6*i;
970f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
971f1af5d2fSBarry Smith     s6 = t[5+idt];
972f1af5d2fSBarry Smith     while (nz--) {
973f1af5d2fSBarry Smith       idx   = 6*(*vi--);
974f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
975f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
976f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
977f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
978f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
979f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
980f1af5d2fSBarry Smith       v -= 36;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith   }
983f1af5d2fSBarry Smith 
984f1af5d2fSBarry Smith   /* copy t into x according to permutation */
985f1af5d2fSBarry Smith   ii = 0;
986f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
987f1af5d2fSBarry Smith     ir      = 6*r[i];
988f1af5d2fSBarry Smith     x[ir]   = t[ii];
989f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
990f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
991f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
992f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
993f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
994f1af5d2fSBarry Smith     ii += 6;
995f1af5d2fSBarry Smith   }
996f1af5d2fSBarry Smith 
997f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
998f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1001dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1002f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1003f1af5d2fSBarry Smith }
1004f1af5d2fSBarry Smith 
10054a2ae208SSatish Balay #undef __FUNCT__
10064a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1007dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1008f1af5d2fSBarry Smith {
1009f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1010f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10116849ba73SBarry Smith   PetscErrorCode ierr;
10125d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1014690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1015f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1018f1af5d2fSBarry Smith 
1019f1af5d2fSBarry Smith   PetscFunctionBegin;
10201ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1022f1af5d2fSBarry Smith   t  = a->solve_work;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1025f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1026f1af5d2fSBarry Smith 
1027f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1028f1af5d2fSBarry Smith   ii = 0;
1029f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1030f1af5d2fSBarry Smith     ic      = 7*c[i];
1031f1af5d2fSBarry Smith     t[ii]   = b[ic];
1032f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1033f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1034f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1035f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1036f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1037f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1038f1af5d2fSBarry Smith     ii += 7;
1039f1af5d2fSBarry Smith   }
1040f1af5d2fSBarry Smith 
1041f1af5d2fSBarry Smith   /* forward solve the U^T */
1042f1af5d2fSBarry Smith   idx = 0;
1043f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1044f1af5d2fSBarry Smith 
1045f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1046f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1047f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1048f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1049f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1050f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1051f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1052f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1053f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1054f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1055f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1056f1af5d2fSBarry Smith     v += 49;
1057f1af5d2fSBarry Smith 
1058f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1059f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1060f1af5d2fSBarry Smith     while (nz--) {
1061f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1062f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1063f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1064f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1065f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1066f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1067f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1068f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1069f1af5d2fSBarry Smith       v  += 49;
1070f1af5d2fSBarry Smith     }
1071f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1072f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1073f1af5d2fSBarry Smith     idx += 7;
1074f1af5d2fSBarry Smith   }
1075f1af5d2fSBarry Smith   /* backward solve the L^T */
1076f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1077f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1078f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1079f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1080f1af5d2fSBarry Smith     idt  = 7*i;
1081f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1082f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1083f1af5d2fSBarry Smith     while (nz--) {
1084f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1085f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1086f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1087f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1088f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1089f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1090f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1091f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1092f1af5d2fSBarry Smith       v -= 49;
1093f1af5d2fSBarry Smith     }
1094f1af5d2fSBarry Smith   }
1095f1af5d2fSBarry Smith 
1096f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1097f1af5d2fSBarry Smith   ii = 0;
1098f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1099f1af5d2fSBarry Smith     ir      = 7*r[i];
1100f1af5d2fSBarry Smith     x[ir]   = t[ii];
1101f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1102f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1103f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1104f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1105f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1106f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1107f1af5d2fSBarry Smith     ii += 7;
1108f1af5d2fSBarry Smith   }
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1111f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1114dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1115f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1116f1af5d2fSBarry Smith }
1117f1af5d2fSBarry Smith 
11184e2b4712SSatish Balay /* ----------------------------------------------------------- */
11194a2ae208SSatish Balay #undef __FUNCT__
11204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1121dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11224e2b4712SSatish Balay {
11234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11256849ba73SBarry Smith   PetscErrorCode ierr;
11265d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11285d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11293f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
113087828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11314e2b4712SSatish Balay 
11324e2b4712SSatish Balay   PetscFunctionBegin;
11331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1135f1af5d2fSBarry Smith   t  = a->solve_work;
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11394e2b4712SSatish Balay 
11404e2b4712SSatish Balay   /* forward solve the lower triangular */
114187828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11424e2b4712SSatish Balay   for (i=1; i<n; i++) {
11434e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11444e2b4712SSatish Balay     vi  = aj + ai[i];
11454e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1146f1af5d2fSBarry Smith     s = t + bs*i;
114787828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11484e2b4712SSatish Balay     while (nz--) {
1149f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11504e2b4712SSatish Balay       v += bs2;
11514e2b4712SSatish Balay     }
11524e2b4712SSatish Balay   }
11534e2b4712SSatish Balay   /* backward solve the upper triangular */
1154d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11564e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11574e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11584e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115987828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11604e2b4712SSatish Balay     while (nz--) {
1161f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11624e2b4712SSatish Balay       v += bs2;
11634e2b4712SSatish Balay     }
1164f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116587828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11664e2b4712SSatish Balay   }
11674e2b4712SSatish Balay 
11684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11734e2b4712SSatish Balay   PetscFunctionReturn(0);
11744e2b4712SSatish Balay }
11754e2b4712SSatish Balay 
11764a2ae208SSatish Balay #undef __FUNCT__
11774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1178dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11794e2b4712SSatish Balay {
11804e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11814e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11826849ba73SBarry Smith   PetscErrorCode ierr;
11835d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11845d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11884e2b4712SSatish Balay 
11894e2b4712SSatish Balay   PetscFunctionBegin;
11901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1192f1af5d2fSBarry Smith   t  = a->solve_work;
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11954e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11964e2b4712SSatish Balay 
11974e2b4712SSatish Balay   /* forward solve the lower triangular */
11984e2b4712SSatish Balay   idx    = 7*(*r++);
1199f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1200f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1201f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12024e2b4712SSatish Balay 
12034e2b4712SSatish Balay   for (i=1; i<n; i++) {
12044e2b4712SSatish Balay     v     = aa + 49*ai[i];
12054e2b4712SSatish Balay     vi    = aj + ai[i];
12064e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12074e2b4712SSatish Balay     idx   = 7*(*r++);
1208f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1209f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12104e2b4712SSatish Balay     while (nz--) {
12114e2b4712SSatish Balay       idx   = 7*(*vi++);
1212f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1213f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1214f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1215f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1216f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1217f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1218f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1219f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1220f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1221f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12224e2b4712SSatish Balay       v += 49;
12234e2b4712SSatish Balay     }
12244e2b4712SSatish Balay     idx = 7*i;
1225f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1226f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1227f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12284e2b4712SSatish Balay   }
12294e2b4712SSatish Balay   /* backward solve the upper triangular */
12304e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12314e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12324e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12334e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12344e2b4712SSatish Balay     idt  = 7*i;
1235f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1236f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1237f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12384e2b4712SSatish Balay     while (nz--) {
12394e2b4712SSatish Balay       idx   = 7*(*vi++);
1240f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1241f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1243f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1244f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1245f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1246f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1247f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1248f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1249f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12504e2b4712SSatish Balay       v += 49;
12514e2b4712SSatish Balay     }
12524e2b4712SSatish Balay     idc = 7*(*c--);
12534e2b4712SSatish Balay     v   = aa + 49*diag[i];
1254f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1255f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1256f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1257f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1258f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1259f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1260f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1261f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1262f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1263f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1264f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1265f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1266f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1267f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12684e2b4712SSatish Balay   }
12694e2b4712SSatish Balay 
12704e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12714e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12721ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12731ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1274dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12754e2b4712SSatish Balay   PetscFunctionReturn(0);
12764e2b4712SSatish Balay }
12774e2b4712SSatish Balay 
12784a2ae208SSatish Balay #undef __FUNCT__
12798f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
12808f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
12818f690400SShri Abhyankar {
12828f690400SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12838f690400SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
12848f690400SShri Abhyankar   PetscErrorCode ierr;
12858f690400SShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*vi;
128629b92fc1SShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,k,m;
12878f690400SShri Abhyankar   MatScalar      *aa=a->a,*v;
12888f690400SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
12898f690400SShri Abhyankar   PetscScalar    *x,*b,*t;
12908f690400SShri Abhyankar 
12918f690400SShri Abhyankar   PetscFunctionBegin;
12928f690400SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12938f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12948f690400SShri Abhyankar   t  = a->solve_work;
12958f690400SShri Abhyankar 
12968f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
129729b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12988f690400SShri Abhyankar 
12998f690400SShri Abhyankar   /* forward solve the lower triangular */
130029b92fc1SShri Abhyankar   idx    = 7*r[0];
13018f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
13028f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
13038f690400SShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
13048f690400SShri Abhyankar 
13058f690400SShri Abhyankar   for (i=1; i<n; i++) {
13068f690400SShri Abhyankar     v     = aa + 49*ai[i];
13078f690400SShri Abhyankar     vi    = aj + ai[i];
13088f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
130929b92fc1SShri Abhyankar     idx   = 7*r[i];
13108f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
13118f690400SShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
131229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
131329b92fc1SShri Abhyankar       idx   = 7*vi[m];
13148f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
13158f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
13168f690400SShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
13178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13228f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13238f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13248f690400SShri Abhyankar       v += 49;
13258f690400SShri Abhyankar     }
13268f690400SShri Abhyankar     idx = 7*i;
13278f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
13288f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
13298f690400SShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
13308f690400SShri Abhyankar   }
13318f690400SShri Abhyankar   /* backward solve the upper triangular */
13328f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
13338f690400SShri Abhyankar     k    = 2*n-i;
13348f690400SShri Abhyankar     v    = aa + 49*ai[k];
13358f690400SShri Abhyankar     vi   = aj + ai[k];
13368f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
13378f690400SShri Abhyankar     idt  = 7*i;
13388f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
13398f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
13408f690400SShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
134129b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
134229b92fc1SShri Abhyankar       idx   = 7*vi[m];
13438f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
13448f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
13458f690400SShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
13468f690400SShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
13478f690400SShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
13488f690400SShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
13498f690400SShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
13508f690400SShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
13518f690400SShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
13528f690400SShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13538f690400SShri Abhyankar       v += 49;
13548f690400SShri Abhyankar     }
135529b92fc1SShri Abhyankar     idc = 7*c[i];
13568f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
13578f690400SShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
13588f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
13598f690400SShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
13608f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
13618f690400SShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
13628f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
13638f690400SShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
13648f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
13658f690400SShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
13668f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
13678f690400SShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
13688f690400SShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
13698f690400SShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13708f690400SShri Abhyankar   }
13718f690400SShri Abhyankar 
13728f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13738f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13748f690400SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13758f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13768f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13778f690400SShri Abhyankar   PetscFunctionReturn(0);
13788f690400SShri Abhyankar }
13798f690400SShri Abhyankar 
13808f690400SShri Abhyankar #undef __FUNCT__
13814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1382dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
138315091d37SBarry Smith {
138415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1385690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1386dfbe8321SBarry Smith   PetscErrorCode    ierr;
1387690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1388d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1389d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1390d9fead3dSBarry Smith   const PetscScalar *b;
139115091d37SBarry Smith 
139215091d37SBarry Smith   PetscFunctionBegin;
1393d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
139515091d37SBarry Smith   /* forward solve the lower triangular */
139615091d37SBarry Smith   idx    = 0;
139715091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
139815091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
139915091d37SBarry Smith   x[6] = b[6+idx];
140015091d37SBarry Smith   for (i=1; i<n; i++) {
140115091d37SBarry Smith     v     =  aa + 49*ai[i];
140215091d37SBarry Smith     vi    =  aj + ai[i];
140315091d37SBarry Smith     nz    =  diag[i] - ai[i];
140415091d37SBarry Smith     idx   =  7*i;
1405f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1406f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1407f1af5d2fSBarry Smith     s7  =  b[6+idx];
140815091d37SBarry Smith     while (nz--) {
140915091d37SBarry Smith       jdx   = 7*(*vi++);
141015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
141115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
141215091d37SBarry Smith       x7    = x[6+jdx];
1413f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1414f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1415f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1416f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1417f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1418f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1419f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
142015091d37SBarry Smith       v += 49;
142115091d37SBarry Smith      }
1422f1af5d2fSBarry Smith     x[idx]   = s1;
1423f1af5d2fSBarry Smith     x[1+idx] = s2;
1424f1af5d2fSBarry Smith     x[2+idx] = s3;
1425f1af5d2fSBarry Smith     x[3+idx] = s4;
1426f1af5d2fSBarry Smith     x[4+idx] = s5;
1427f1af5d2fSBarry Smith     x[5+idx] = s6;
1428f1af5d2fSBarry Smith     x[6+idx] = s7;
142915091d37SBarry Smith   }
143015091d37SBarry Smith   /* backward solve the upper triangular */
143115091d37SBarry Smith   for (i=n-1; i>=0; i--){
143215091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
143315091d37SBarry Smith     vi   = aj + diag[i] + 1;
143415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
143515091d37SBarry Smith     idt  = 7*i;
1436f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1437f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1438f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1439f1af5d2fSBarry Smith     s7 = x[6+idt];
144015091d37SBarry Smith     while (nz--) {
144115091d37SBarry Smith       idx   = 7*(*vi++);
144215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
144315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
144415091d37SBarry Smith       x7    = x[6+idx];
1445f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1446f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1447f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1448f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1449f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1450f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1451f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
145215091d37SBarry Smith       v += 49;
145315091d37SBarry Smith     }
145415091d37SBarry Smith     v        = aa + 49*diag[i];
1455f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1456f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1457f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1458f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1459f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1460f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1461f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1462f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1463f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1464f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1465f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1466f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1467f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1468f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
146915091d37SBarry Smith   }
147015091d37SBarry Smith 
1471d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1473dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
147415091d37SBarry Smith   PetscFunctionReturn(0);
147515091d37SBarry Smith }
147615091d37SBarry Smith 
14774a2ae208SSatish Balay #undef __FUNCT__
1478cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1479cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1480cee9d6f2SShri Abhyankar {
1481cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
14826464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1483cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1484cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1485cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1486cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1487cee9d6f2SShri Abhyankar     PetscScalar       *x;
1488cee9d6f2SShri Abhyankar     const PetscScalar *b;
1489cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1490cee9d6f2SShri Abhyankar 
1491cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1492cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1493cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1494cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1495cee9d6f2SShri Abhyankar     idx    = 0;
1496cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1497cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1498cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1499cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1500cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1501cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1502cee9d6f2SShri Abhyankar       idx   = bs*i;
1503cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1504cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
15056464896eSShri Abhyankar        for(k=0;k<nz;k++) {
15066464896eSShri Abhyankar           jdx   = bs*vi[k];
1507cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1508cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1509cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1510cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1511cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1512cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1513cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1514cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1515cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1516cee9d6f2SShri Abhyankar           v   +=  bs2;
1517cee9d6f2SShri Abhyankar         }
1518cee9d6f2SShri Abhyankar 
1519cee9d6f2SShri Abhyankar        x[idx]   = s1;
1520cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1521cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1522cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1523cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1524cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1525cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1526cee9d6f2SShri Abhyankar     }
1527cee9d6f2SShri Abhyankar 
1528cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1529cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1530cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1531cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1532cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1533cee9d6f2SShri Abhyankar      idt = bs*i;
1534cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1535cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
15366464896eSShri Abhyankar     for(k=0;k<nz;k++) {
15376464896eSShri Abhyankar       idx   = bs*vi[k];
1538cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1539cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1540cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1541cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1542cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1543cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1544cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1545cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1546cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1547cee9d6f2SShri Abhyankar         v   +=  bs2;
1548cee9d6f2SShri Abhyankar     }
1549cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1550cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1551cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1552cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1553cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1554cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1555cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1556cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1557cee9d6f2SShri Abhyankar   }
1558cee9d6f2SShri Abhyankar 
1559cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1560cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1561cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1562cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1563cee9d6f2SShri Abhyankar }
1564cee9d6f2SShri Abhyankar 
1565cee9d6f2SShri Abhyankar #undef __FUNCT__
156653cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2"
156753cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
156853cca76cSShri Abhyankar {
156953cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
157053cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
157153cca76cSShri Abhyankar     PetscErrorCode    ierr;
157253cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
157353cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
157453cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
157553cca76cSShri Abhyankar     PetscScalar       *x;
157653cca76cSShri Abhyankar     const PetscScalar *b;
157753cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
157853cca76cSShri Abhyankar 
157953cca76cSShri Abhyankar     PetscFunctionBegin;
158053cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
158153cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
158253cca76cSShri Abhyankar     /* forward solve the lower triangular */
158353cca76cSShri Abhyankar     idx    = 0;
158453cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
158553cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
158653cca76cSShri Abhyankar     for (i=1; i<n; i++) {
158753cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
158853cca76cSShri Abhyankar        vi   = aj + ai[i];
158953cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
159053cca76cSShri Abhyankar       idx   = bs*i;
159153cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
159253cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
159353cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
159453cca76cSShri Abhyankar           jdx   = bs*vi[k];
159553cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
159653cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
159753cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
159853cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
159953cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
160053cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
160153cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
160253cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
160353cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
160453cca76cSShri Abhyankar           v   +=  bs2;
160553cca76cSShri Abhyankar         }
160653cca76cSShri Abhyankar 
160753cca76cSShri Abhyankar        x[idx]   = s1;
160853cca76cSShri Abhyankar        x[1+idx] = s2;
160953cca76cSShri Abhyankar        x[2+idx] = s3;
161053cca76cSShri Abhyankar        x[3+idx] = s4;
161153cca76cSShri Abhyankar        x[4+idx] = s5;
161253cca76cSShri Abhyankar        x[5+idx] = s6;
161353cca76cSShri Abhyankar        x[6+idx] = s7;
161453cca76cSShri Abhyankar     }
161553cca76cSShri Abhyankar 
161653cca76cSShri Abhyankar    /* backward solve the upper triangular */
161753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
161853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
161953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
162053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
162153cca76cSShri Abhyankar      idt = bs*i;
162253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
162353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
162453cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
162553cca76cSShri Abhyankar       idx   = bs*vi[k];
162653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
162753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
162853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
162953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
163053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
163153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
163253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
163353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
163453cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
163553cca76cSShri Abhyankar         v   +=  bs2;
163653cca76cSShri Abhyankar     }
163753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
163853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
163953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
164053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
164153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
164253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
164353cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
164453cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
164553cca76cSShri Abhyankar   }
164653cca76cSShri Abhyankar 
164753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
164853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
164953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
165053cca76cSShri Abhyankar   PetscFunctionReturn(0);
165153cca76cSShri Abhyankar }
165253cca76cSShri Abhyankar 
165353cca76cSShri Abhyankar #undef __FUNCT__
16544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1655dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
165615091d37SBarry Smith {
165715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
165815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
16596849ba73SBarry Smith   PetscErrorCode    ierr;
16605d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
16615d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1662d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1663d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1664d9fead3dSBarry Smith   const PetscScalar *b;
166515091d37SBarry Smith   PetscFunctionBegin;
1666d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16671ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1668f1af5d2fSBarry Smith   t  = a->solve_work;
166915091d37SBarry Smith 
167015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
167115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
167215091d37SBarry Smith 
167315091d37SBarry Smith   /* forward solve the lower triangular */
167415091d37SBarry Smith   idx    = 6*(*r++);
1675f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1676f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1677f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
167815091d37SBarry Smith   for (i=1; i<n; i++) {
167915091d37SBarry Smith     v     = aa + 36*ai[i];
168015091d37SBarry Smith     vi    = aj + ai[i];
168115091d37SBarry Smith     nz    = diag[i] - ai[i];
168215091d37SBarry Smith     idx   = 6*(*r++);
1683f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1684f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
168515091d37SBarry Smith     while (nz--) {
168615091d37SBarry Smith       idx   = 6*(*vi++);
1687f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1688f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1689f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1690f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1691f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1692f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1693f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1694f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
169515091d37SBarry Smith       v += 36;
169615091d37SBarry Smith     }
169715091d37SBarry Smith     idx = 6*i;
1698f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1699f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1700f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
170115091d37SBarry Smith   }
170215091d37SBarry Smith   /* backward solve the upper triangular */
170315091d37SBarry Smith   for (i=n-1; i>=0; i--){
170415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
170515091d37SBarry Smith     vi   = aj + diag[i] + 1;
170615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
170715091d37SBarry Smith     idt  = 6*i;
1708f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1709f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1710f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
171115091d37SBarry Smith     while (nz--) {
171215091d37SBarry Smith       idx   = 6*(*vi++);
1713f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1714f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1715f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1716f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1717f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1718f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1719f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1720f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1721f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
172215091d37SBarry Smith       v += 36;
172315091d37SBarry Smith     }
172415091d37SBarry Smith     idc = 6*(*c--);
172515091d37SBarry Smith     v   = aa + 36*diag[i];
1726f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1727f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1728f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1729f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1730f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1731f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1732f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1733f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1734f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1735f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1736f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1737f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
173815091d37SBarry Smith   }
173915091d37SBarry Smith 
174015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
174115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1742d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1744dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
174515091d37SBarry Smith   PetscFunctionReturn(0);
174615091d37SBarry Smith }
174715091d37SBarry Smith 
17484a2ae208SSatish Balay #undef __FUNCT__
17498f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
17508f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
17518f690400SShri Abhyankar {
17528f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17538f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
17548f690400SShri Abhyankar   PetscErrorCode    ierr;
17558f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
175629b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
17578f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
17588f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
17598f690400SShri Abhyankar   const PetscScalar *b;
17608f690400SShri Abhyankar   PetscFunctionBegin;
17618f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17628f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
17638f690400SShri Abhyankar   t  = a->solve_work;
17648f690400SShri Abhyankar 
17658f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
176629b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
17678f690400SShri Abhyankar 
17688f690400SShri Abhyankar   /* forward solve the lower triangular */
176929b92fc1SShri Abhyankar   idx    = 6*r[0];
17708f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
17718f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
17728f690400SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
17738f690400SShri Abhyankar   for (i=1; i<n; i++) {
17748f690400SShri Abhyankar     v     = aa + 36*ai[i];
17758f690400SShri Abhyankar     vi    = aj + ai[i];
17768f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
177729b92fc1SShri Abhyankar     idx   = 6*r[i];
17788f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17798f690400SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
178029b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
178129b92fc1SShri Abhyankar       idx   = 6*vi[m];
17828f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
17838f690400SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
17848f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
17858f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
17868f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
17878f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
17888f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
17898f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
17908f690400SShri Abhyankar       v += 36;
17918f690400SShri Abhyankar     }
17928f690400SShri Abhyankar     idx = 6*i;
17938f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
17948f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
17958f690400SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
17968f690400SShri Abhyankar   }
17978f690400SShri Abhyankar   /* backward solve the upper triangular */
17988f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
17998f690400SShri Abhyankar     k    = 2*n-i;
18008f690400SShri Abhyankar     v    = aa + 36*ai[k];
18018f690400SShri Abhyankar     vi   = aj + ai[k];
18028f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
18038f690400SShri Abhyankar     idt  = 6*i;
18048f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
18058f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
18068f690400SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
180729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
180829b92fc1SShri Abhyankar       idx   = 6*vi[m];
18098f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
18108f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
18118f690400SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
18128f690400SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18138f690400SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18148f690400SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18158f690400SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18168f690400SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18178f690400SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18188f690400SShri Abhyankar       v += 36;
18198f690400SShri Abhyankar     }
182029b92fc1SShri Abhyankar     idc = 6*c[i];
18218f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
18228f690400SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
18238f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
18248f690400SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
18258f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
18268f690400SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
18278f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
18288f690400SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
18298f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
18308f690400SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
18318f690400SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
18328f690400SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
18338f690400SShri Abhyankar   }
18348f690400SShri Abhyankar 
18358f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18368f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18378f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18388f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
18398f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
18408f690400SShri Abhyankar   PetscFunctionReturn(0);
18418f690400SShri Abhyankar }
18428f690400SShri Abhyankar 
18438f690400SShri Abhyankar 
18448f690400SShri Abhyankar #undef __FUNCT__
18454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1846dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
184715091d37SBarry Smith {
184815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1849690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850dfbe8321SBarry Smith   PetscErrorCode    ierr;
1851690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1852d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1853d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1854d9fead3dSBarry Smith   const PetscScalar *b;
185515091d37SBarry Smith 
185615091d37SBarry Smith   PetscFunctionBegin;
1857d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18581ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
185915091d37SBarry Smith   /* forward solve the lower triangular */
186015091d37SBarry Smith   idx    = 0;
186115091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
186215091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
186315091d37SBarry Smith   for (i=1; i<n; i++) {
186415091d37SBarry Smith     v     =  aa + 36*ai[i];
186515091d37SBarry Smith     vi    =  aj + ai[i];
186615091d37SBarry Smith     nz    =  diag[i] - ai[i];
186715091d37SBarry Smith     idx   =  6*i;
1868f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1869f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
187015091d37SBarry Smith     while (nz--) {
187115091d37SBarry Smith       jdx   = 6*(*vi++);
187215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
187315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1874f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1875f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1876f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1877f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1878f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1879f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
188015091d37SBarry Smith       v += 36;
188115091d37SBarry Smith      }
1882f1af5d2fSBarry Smith     x[idx]   = s1;
1883f1af5d2fSBarry Smith     x[1+idx] = s2;
1884f1af5d2fSBarry Smith     x[2+idx] = s3;
1885f1af5d2fSBarry Smith     x[3+idx] = s4;
1886f1af5d2fSBarry Smith     x[4+idx] = s5;
1887f1af5d2fSBarry Smith     x[5+idx] = s6;
188815091d37SBarry Smith   }
188915091d37SBarry Smith   /* backward solve the upper triangular */
189015091d37SBarry Smith   for (i=n-1; i>=0; i--){
189115091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
189215091d37SBarry Smith     vi   = aj + diag[i] + 1;
189315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
189415091d37SBarry Smith     idt  = 6*i;
1895f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1896f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1897f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
189815091d37SBarry Smith     while (nz--) {
189915091d37SBarry Smith       idx   = 6*(*vi++);
190015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
190115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1902f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1903f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1904f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1905f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1906f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1907f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
190815091d37SBarry Smith       v += 36;
190915091d37SBarry Smith     }
191015091d37SBarry Smith     v        = aa + 36*diag[i];
1911f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1912f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1913f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1914f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1915f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1916f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
191715091d37SBarry Smith   }
191815091d37SBarry Smith 
1919d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1921dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
192215091d37SBarry Smith   PetscFunctionReturn(0);
192315091d37SBarry Smith }
192415091d37SBarry Smith 
19254a2ae208SSatish Balay #undef __FUNCT__
1926cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1927cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1928cee9d6f2SShri Abhyankar {
1929cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
19306464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1931cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1932cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1933cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1934cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1935cee9d6f2SShri Abhyankar     PetscScalar       *x;
1936cee9d6f2SShri Abhyankar     const PetscScalar *b;
1937cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1938cee9d6f2SShri Abhyankar 
1939cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1940cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1941cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1942cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1943cee9d6f2SShri Abhyankar     idx    = 0;
1944cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1945cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
1946cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1947cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1948cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1949cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1950cee9d6f2SShri Abhyankar       idx   = bs*i;
1951cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1952cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
19536464896eSShri Abhyankar        for(k=0;k<nz;k++){
19546464896eSShri Abhyankar           jdx   = bs*vi[k];
1955cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1956cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1957cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1958cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1959cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1960cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1961cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1962cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1963cee9d6f2SShri Abhyankar           v   +=  bs2;
1964cee9d6f2SShri Abhyankar         }
1965cee9d6f2SShri Abhyankar 
1966cee9d6f2SShri Abhyankar        x[idx]   = s1;
1967cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1968cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1969cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1970cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1971cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1972cee9d6f2SShri Abhyankar     }
1973cee9d6f2SShri Abhyankar 
1974cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1975cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1976cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1977cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1978cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1979cee9d6f2SShri Abhyankar      idt = bs*i;
1980cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1981cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
19826464896eSShri Abhyankar      for(k=0;k<nz;k++){
19836464896eSShri Abhyankar       idx   = bs*vi[k];
1984cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1985cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
1986cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1987cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1988cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1989cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1990cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1991cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1992cee9d6f2SShri Abhyankar         v   +=  bs2;
1993cee9d6f2SShri Abhyankar     }
1994cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1995cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1996cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1997cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1998cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1999cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2000cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
2001cee9d6f2SShri Abhyankar   }
2002cee9d6f2SShri Abhyankar 
2003cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2004cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2005cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2006cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2007cee9d6f2SShri Abhyankar }
20088f690400SShri Abhyankar 
2009cee9d6f2SShri Abhyankar #undef __FUNCT__
201053cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2"
201153cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
201253cca76cSShri Abhyankar {
201353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
201453cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
201553cca76cSShri Abhyankar     PetscErrorCode    ierr;
201653cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
201753cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
201853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
201953cca76cSShri Abhyankar     PetscScalar       *x;
202053cca76cSShri Abhyankar     const PetscScalar *b;
202153cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
202253cca76cSShri Abhyankar 
202353cca76cSShri Abhyankar     PetscFunctionBegin;
202453cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
202553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
202653cca76cSShri Abhyankar     /* forward solve the lower triangular */
202753cca76cSShri Abhyankar     idx    = 0;
202853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
202953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
203053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
203153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
203253cca76cSShri Abhyankar        vi   = aj + ai[i];
203353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
203453cca76cSShri Abhyankar       idx   = bs*i;
203553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
203653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
203753cca76cSShri Abhyankar        for(k=0;k<nz;k++){
203853cca76cSShri Abhyankar           jdx   = bs*vi[k];
203953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
204053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
204153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
204253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
204353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
204453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
204553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
204653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
204753cca76cSShri Abhyankar           v   +=  bs2;
204853cca76cSShri Abhyankar         }
204953cca76cSShri Abhyankar 
205053cca76cSShri Abhyankar        x[idx]   = s1;
205153cca76cSShri Abhyankar        x[1+idx] = s2;
205253cca76cSShri Abhyankar        x[2+idx] = s3;
205353cca76cSShri Abhyankar        x[3+idx] = s4;
205453cca76cSShri Abhyankar        x[4+idx] = s5;
205553cca76cSShri Abhyankar        x[5+idx] = s6;
205653cca76cSShri Abhyankar     }
205753cca76cSShri Abhyankar 
205853cca76cSShri Abhyankar    /* backward solve the upper triangular */
205953cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
206053cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
206153cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
206253cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
206353cca76cSShri Abhyankar      idt = bs*i;
206453cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
206553cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
206653cca76cSShri Abhyankar      for(k=0;k<nz;k++){
206753cca76cSShri Abhyankar       idx   = bs*vi[k];
206853cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
206953cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
207053cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
207153cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
207253cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
207353cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
207453cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
207553cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
207653cca76cSShri Abhyankar         v   +=  bs2;
207753cca76cSShri Abhyankar     }
207853cca76cSShri Abhyankar     /* x = inv_diagonal*x */
207953cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
208053cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
208153cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
208253cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
208353cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
208453cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
208553cca76cSShri Abhyankar   }
208653cca76cSShri Abhyankar 
208753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
208853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
208953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
209053cca76cSShri Abhyankar   PetscFunctionReturn(0);
209153cca76cSShri Abhyankar }
209253cca76cSShri Abhyankar 
209353cca76cSShri Abhyankar #undef __FUNCT__
20944a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2095dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
20964e2b4712SSatish Balay {
20974e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20984e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
20996849ba73SBarry Smith   PetscErrorCode    ierr;
21005d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
21015d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2102d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2103d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2104d9fead3dSBarry Smith   const PetscScalar *b;
21054e2b4712SSatish Balay 
21064e2b4712SSatish Balay   PetscFunctionBegin;
2107d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2109f1af5d2fSBarry Smith   t  = a->solve_work;
21104e2b4712SSatish Balay 
21114e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21124e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
21134e2b4712SSatish Balay 
21144e2b4712SSatish Balay   /* forward solve the lower triangular */
21154e2b4712SSatish Balay   idx    = 5*(*r++);
2116f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2117f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
21184e2b4712SSatish Balay   for (i=1; i<n; i++) {
21194e2b4712SSatish Balay     v     = aa + 25*ai[i];
21204e2b4712SSatish Balay     vi    = aj + ai[i];
21214e2b4712SSatish Balay     nz    = diag[i] - ai[i];
21224e2b4712SSatish Balay     idx   = 5*(*r++);
2123f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2124f1af5d2fSBarry Smith     s5  = b[4+idx];
21254e2b4712SSatish Balay     while (nz--) {
21264e2b4712SSatish Balay       idx   = 5*(*vi++);
2127f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2128f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2129f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2130f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2131f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2132f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2133f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
21344e2b4712SSatish Balay       v += 25;
21354e2b4712SSatish Balay     }
21364e2b4712SSatish Balay     idx = 5*i;
2137f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2138f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
21394e2b4712SSatish Balay   }
21404e2b4712SSatish Balay   /* backward solve the upper triangular */
21414e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21424e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
21434e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21444e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
21454e2b4712SSatish Balay     idt  = 5*i;
2146f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2147f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
21484e2b4712SSatish Balay     while (nz--) {
21494e2b4712SSatish Balay       idx   = 5*(*vi++);
2150f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2151f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2152f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2153f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2154f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2155f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2156f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
21574e2b4712SSatish Balay       v += 25;
21584e2b4712SSatish Balay     }
21594e2b4712SSatish Balay     idc = 5*(*c--);
21604e2b4712SSatish Balay     v   = aa + 25*diag[i];
2161f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2162f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2163f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2164f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2165f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2166f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2167f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2168f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2169f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2170f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
21714e2b4712SSatish Balay   }
21724e2b4712SSatish Balay 
21734e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21744e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2175d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21761ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2177dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
21784e2b4712SSatish Balay   PetscFunctionReturn(0);
21794e2b4712SSatish Balay }
21804e2b4712SSatish Balay 
21814a2ae208SSatish Balay #undef __FUNCT__
21828f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
21838f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
21848f690400SShri Abhyankar {
21858f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21868f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
21878f690400SShri Abhyankar   PetscErrorCode    ierr;
21888f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
218929b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
21908f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
21918f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
21928f690400SShri Abhyankar   const PetscScalar *b;
21938f690400SShri Abhyankar 
21948f690400SShri Abhyankar   PetscFunctionBegin;
21958f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21968f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21978f690400SShri Abhyankar   t  = a->solve_work;
21988f690400SShri Abhyankar 
21998f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
220029b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22018f690400SShri Abhyankar 
22028f690400SShri Abhyankar   /* forward solve the lower triangular */
220329b92fc1SShri Abhyankar   idx    = 5*r[0];
22048f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
22058f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
22068f690400SShri Abhyankar   for (i=1; i<n; i++) {
22078f690400SShri Abhyankar     v     = aa + 25*ai[i];
22088f690400SShri Abhyankar     vi    = aj + ai[i];
22098f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
221029b92fc1SShri Abhyankar     idx   = 5*r[i];
22118f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22128f690400SShri Abhyankar     s5  = b[4+idx];
221329b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
221429b92fc1SShri Abhyankar       idx   = 5*vi[m];
22158f690400SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
22168f690400SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
22178f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
22188f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
22198f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
22208f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
22218f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22228f690400SShri Abhyankar       v += 25;
22238f690400SShri Abhyankar     }
22248f690400SShri Abhyankar     idx = 5*i;
22258f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
22268f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
22278f690400SShri Abhyankar   }
22288f690400SShri Abhyankar   /* backward solve the upper triangular */
22298f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
22308f690400SShri Abhyankar     k    = 2*n-i;
22318f690400SShri Abhyankar     v    = aa + 25*ai[k];
22328f690400SShri Abhyankar     vi   = aj + ai[k];
22338f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
22348f690400SShri Abhyankar     idt  = 5*i;
22358f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
22368f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
223729b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
223829b92fc1SShri Abhyankar       idx   = 5*vi[m];
22398f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
22408f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
22418f690400SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
22428f690400SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
22438f690400SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
22448f690400SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
22458f690400SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
22468f690400SShri Abhyankar       v += 25;
22478f690400SShri Abhyankar     }
224829b92fc1SShri Abhyankar     idc = 5*c[i];
22498f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
22508f690400SShri Abhyankar                                  v[15]*s4+v[20]*s5;
22518f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
22528f690400SShri Abhyankar                                  v[16]*s4+v[21]*s5;
22538f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
22548f690400SShri Abhyankar                                  v[17]*s4+v[22]*s5;
22558f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
22568f690400SShri Abhyankar                                  v[18]*s4+v[23]*s5;
22578f690400SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
22588f690400SShri Abhyankar                                  v[19]*s4+v[24]*s5;
22598f690400SShri Abhyankar   }
22608f690400SShri Abhyankar 
22618f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22628f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22638f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22648f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22658f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
22668f690400SShri Abhyankar   PetscFunctionReturn(0);
22678f690400SShri Abhyankar }
2268*78bb4007SShri Abhyankar 
2269*78bb4007SShri Abhyankar #undef __FUNCT__
2270*78bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct_v2"
2271*78bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2272*78bb4007SShri Abhyankar {
2273*78bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2274*78bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2275*78bb4007SShri Abhyankar   PetscErrorCode    ierr;
2276*78bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2277*78bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2278*78bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2279*78bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2280*78bb4007SShri Abhyankar   const PetscScalar *b;
2281*78bb4007SShri Abhyankar 
2282*78bb4007SShri Abhyankar   PetscFunctionBegin;
2283*78bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2284*78bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2285*78bb4007SShri Abhyankar   t  = a->solve_work;
2286*78bb4007SShri Abhyankar 
2287*78bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2288*78bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2289*78bb4007SShri Abhyankar 
2290*78bb4007SShri Abhyankar   /* forward solve the lower triangular */
2291*78bb4007SShri Abhyankar   idx    = 5*r[0];
2292*78bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
2293*78bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2294*78bb4007SShri Abhyankar   for (i=1; i<n; i++) {
2295*78bb4007SShri Abhyankar     v     = aa + 25*ai[i];
2296*78bb4007SShri Abhyankar     vi    = aj + ai[i];
2297*78bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
2298*78bb4007SShri Abhyankar     idx   = 5*r[i];
2299*78bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2300*78bb4007SShri Abhyankar     s5  = b[4+idx];
2301*78bb4007SShri Abhyankar     for(m=0;m<nz;m++){
2302*78bb4007SShri Abhyankar       idx   = 5*vi[m];
2303*78bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2304*78bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
2305*78bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2306*78bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2307*78bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2308*78bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2309*78bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2310*78bb4007SShri Abhyankar       v += 25;
2311*78bb4007SShri Abhyankar     }
2312*78bb4007SShri Abhyankar     idx = 5*i;
2313*78bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
2314*78bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2315*78bb4007SShri Abhyankar   }
2316*78bb4007SShri Abhyankar   /* backward solve the upper triangular */
2317*78bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
2318*78bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
2319*78bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
2320*78bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
2321*78bb4007SShri Abhyankar     idt  = 5*i;
2322*78bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
2323*78bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2324*78bb4007SShri Abhyankar     for(m=0;m<nz;m++){
2325*78bb4007SShri Abhyankar       idx   = 5*vi[m];
2326*78bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
2327*78bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2328*78bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2329*78bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2330*78bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2331*78bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2332*78bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
2333*78bb4007SShri Abhyankar       v += 25;
2334*78bb4007SShri Abhyankar     }
2335*78bb4007SShri Abhyankar     idc = 5*c[i];
2336*78bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2337*78bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
2338*78bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2339*78bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
2340*78bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2341*78bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
2342*78bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2343*78bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
2344*78bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2345*78bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
2346*78bb4007SShri Abhyankar   }
2347*78bb4007SShri Abhyankar 
2348*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2349*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2350*78bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2351*78bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2352*78bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2353*78bb4007SShri Abhyankar   PetscFunctionReturn(0);
2354*78bb4007SShri Abhyankar }
2355*78bb4007SShri Abhyankar 
23568f690400SShri Abhyankar #undef __FUNCT__
23574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2358dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
235915091d37SBarry Smith {
236015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2361690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2362dfbe8321SBarry Smith   PetscErrorCode    ierr;
2363690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2364d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2365d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2366d9fead3dSBarry Smith   const PetscScalar *b;
236715091d37SBarry Smith 
236815091d37SBarry Smith   PetscFunctionBegin;
2369d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
237115091d37SBarry Smith   /* forward solve the lower triangular */
237215091d37SBarry Smith   idx    = 0;
237315091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
237415091d37SBarry Smith   for (i=1; i<n; i++) {
237515091d37SBarry Smith     v     =  aa + 25*ai[i];
237615091d37SBarry Smith     vi    =  aj + ai[i];
237715091d37SBarry Smith     nz    =  diag[i] - ai[i];
237815091d37SBarry Smith     idx   =  5*i;
2379f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
238015091d37SBarry Smith     while (nz--) {
238115091d37SBarry Smith       jdx   = 5*(*vi++);
238215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2383f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2384f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2385f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2386f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2387f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
238815091d37SBarry Smith       v    += 25;
238915091d37SBarry Smith     }
2390f1af5d2fSBarry Smith     x[idx]   = s1;
2391f1af5d2fSBarry Smith     x[1+idx] = s2;
2392f1af5d2fSBarry Smith     x[2+idx] = s3;
2393f1af5d2fSBarry Smith     x[3+idx] = s4;
2394f1af5d2fSBarry Smith     x[4+idx] = s5;
239515091d37SBarry Smith   }
239615091d37SBarry Smith   /* backward solve the upper triangular */
239715091d37SBarry Smith   for (i=n-1; i>=0; i--){
239815091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
239915091d37SBarry Smith     vi   = aj + diag[i] + 1;
240015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
240115091d37SBarry Smith     idt  = 5*i;
2402f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2403f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
240415091d37SBarry Smith     while (nz--) {
240515091d37SBarry Smith       idx   = 5*(*vi++);
240615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2407f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2408f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2409f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2410f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2411f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
241215091d37SBarry Smith       v    += 25;
241315091d37SBarry Smith     }
241415091d37SBarry Smith     v        = aa + 25*diag[i];
2415f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2416f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2417f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2418f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2419f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
242015091d37SBarry Smith   }
242115091d37SBarry Smith 
2422d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24231ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2424dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
242515091d37SBarry Smith   PetscFunctionReturn(0);
242615091d37SBarry Smith }
242715091d37SBarry Smith 
24284a2ae208SSatish Balay #undef __FUNCT__
2429cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2430cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2431cee9d6f2SShri Abhyankar {
2432cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
24336464896eSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2434cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
2435cee9d6f2SShri Abhyankar   PetscInt          jdx;
2436cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2437cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2438cee9d6f2SShri Abhyankar   const PetscScalar *b;
2439cee9d6f2SShri Abhyankar 
2440cee9d6f2SShri Abhyankar   PetscFunctionBegin;
2441cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2442cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2443cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
2444cee9d6f2SShri Abhyankar   idx    = 0;
2445cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
2446cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
2447cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
2448cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
2449cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
2450cee9d6f2SShri Abhyankar     idx = 5*i;
2451cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
24526464896eSShri Abhyankar     for(k=0;k<nz;k++) {
24536464896eSShri Abhyankar       jdx   = 5*vi[k];
2454cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2455cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2456cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2457cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2458cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2459cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2460cee9d6f2SShri Abhyankar       v    += 25;
2461cee9d6f2SShri Abhyankar     }
2462cee9d6f2SShri Abhyankar     x[idx]   = s1;
2463cee9d6f2SShri Abhyankar     x[1+idx] = s2;
2464cee9d6f2SShri Abhyankar     x[2+idx] = s3;
2465cee9d6f2SShri Abhyankar     x[3+idx] = s4;
2466cee9d6f2SShri Abhyankar     x[4+idx] = s5;
2467cee9d6f2SShri Abhyankar   }
2468cee9d6f2SShri Abhyankar 
2469cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
2470cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2471cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
2472cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
2473cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2474cee9d6f2SShri Abhyankar     idt = 5*i;
2475cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
2476cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
24776464896eSShri Abhyankar     for(k=0;k<nz;k++){
24786464896eSShri Abhyankar       idx   = 5*vi[k];
2479cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2480cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2481cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2482cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2483cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2484cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
2485cee9d6f2SShri Abhyankar       v    += 25;
2486cee9d6f2SShri Abhyankar     }
2487cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2488cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2489cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2490cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2491cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2492cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
2493cee9d6f2SShri Abhyankar   }
2494cee9d6f2SShri Abhyankar 
2495cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2496cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2497cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
2498cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2499cee9d6f2SShri Abhyankar }
2500cee9d6f2SShri Abhyankar 
2501cee9d6f2SShri Abhyankar #undef __FUNCT__
250253cca76cSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2"
250353cca76cSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
250453cca76cSShri Abhyankar {
250553cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
250653cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
250753cca76cSShri Abhyankar   PetscErrorCode    ierr;
250853cca76cSShri Abhyankar   PetscInt          jdx;
250953cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
251053cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
251153cca76cSShri Abhyankar   const PetscScalar *b;
251253cca76cSShri Abhyankar 
251353cca76cSShri Abhyankar   PetscFunctionBegin;
251453cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
251553cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251653cca76cSShri Abhyankar   /* forward solve the lower triangular */
251753cca76cSShri Abhyankar   idx    = 0;
251853cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
251953cca76cSShri Abhyankar   for (i=1; i<n; i++) {
252053cca76cSShri Abhyankar     v   = aa + 25*ai[i];
252153cca76cSShri Abhyankar     vi  = aj + ai[i];
252253cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
252353cca76cSShri Abhyankar     idx = 5*i;
252453cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
252553cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
252653cca76cSShri Abhyankar       jdx   = 5*vi[k];
252753cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
252853cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
252953cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
253053cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
253153cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
253253cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
253353cca76cSShri Abhyankar       v    += 25;
253453cca76cSShri Abhyankar     }
253553cca76cSShri Abhyankar     x[idx]   = s1;
253653cca76cSShri Abhyankar     x[1+idx] = s2;
253753cca76cSShri Abhyankar     x[2+idx] = s3;
253853cca76cSShri Abhyankar     x[3+idx] = s4;
253953cca76cSShri Abhyankar     x[4+idx] = s5;
254053cca76cSShri Abhyankar   }
254153cca76cSShri Abhyankar 
254253cca76cSShri Abhyankar   /* backward solve the upper triangular */
254353cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
254453cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
254553cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
254653cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
254753cca76cSShri Abhyankar     idt = 5*i;
254853cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
254953cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
255053cca76cSShri Abhyankar     for(k=0;k<nz;k++){
255153cca76cSShri Abhyankar       idx   = 5*vi[k];
255253cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
255353cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
255453cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
255553cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
255653cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
255753cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
255853cca76cSShri Abhyankar       v    += 25;
255953cca76cSShri Abhyankar     }
256053cca76cSShri Abhyankar     /* x = inv_diagonal*x */
256153cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
256253cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
256353cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
256453cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
256553cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
256653cca76cSShri Abhyankar   }
256753cca76cSShri Abhyankar 
256853cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
256953cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
257053cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
257153cca76cSShri Abhyankar   PetscFunctionReturn(0);
257253cca76cSShri Abhyankar }
257353cca76cSShri Abhyankar 
257453cca76cSShri Abhyankar #undef __FUNCT__
25754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2576dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
25774e2b4712SSatish Balay {
25784e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
25794e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
25806849ba73SBarry Smith   PetscErrorCode    ierr;
25815d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
25825d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2583d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2584d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2585d9fead3dSBarry Smith   const PetscScalar *b;
25864e2b4712SSatish Balay 
25874e2b4712SSatish Balay   PetscFunctionBegin;
2588d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2590f1af5d2fSBarry Smith   t  = a->solve_work;
25914e2b4712SSatish Balay 
25924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25944e2b4712SSatish Balay 
25954e2b4712SSatish Balay   /* forward solve the lower triangular */
25964e2b4712SSatish Balay   idx    = 4*(*r++);
2597f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2598f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
25994e2b4712SSatish Balay   for (i=1; i<n; i++) {
26004e2b4712SSatish Balay     v     = aa + 16*ai[i];
26014e2b4712SSatish Balay     vi    = aj + ai[i];
26024e2b4712SSatish Balay     nz    = diag[i] - ai[i];
26034e2b4712SSatish Balay     idx   = 4*(*r++);
2604f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
26054e2b4712SSatish Balay     while (nz--) {
26064e2b4712SSatish Balay       idx   = 4*(*vi++);
2607f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2608f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2609f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2610f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2611f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
26124e2b4712SSatish Balay       v    += 16;
26134e2b4712SSatish Balay     }
26144e2b4712SSatish Balay     idx        = 4*i;
2615f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2616f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
26174e2b4712SSatish Balay   }
26184e2b4712SSatish Balay   /* backward solve the upper triangular */
26194e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
26204e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
26214e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
26224e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
26234e2b4712SSatish Balay     idt  = 4*i;
2624f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2625f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
26264e2b4712SSatish Balay     while (nz--) {
26274e2b4712SSatish Balay       idx   = 4*(*vi++);
2628f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2629f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2630f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2631f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2632f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2633f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
26344e2b4712SSatish Balay       v += 16;
26354e2b4712SSatish Balay     }
26364e2b4712SSatish Balay     idc      = 4*(*c--);
26374e2b4712SSatish Balay     v        = aa + 16*diag[i];
2638f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2639f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2640f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2641f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
26424e2b4712SSatish Balay   }
26434e2b4712SSatish Balay 
26444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
26454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2646d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2648dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
26494e2b4712SSatish Balay   PetscFunctionReturn(0);
26504e2b4712SSatish Balay }
2651f26ec98cSKris Buschelman 
2652f26ec98cSKris Buschelman #undef __FUNCT__
26538f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
26548f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
26558f690400SShri Abhyankar {
26568f690400SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
26578f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
26588f690400SShri Abhyankar   PetscErrorCode    ierr;
265929b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
26608f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
26618f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
26628f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
26638f690400SShri Abhyankar   const PetscScalar *b;
26648f690400SShri Abhyankar 
26658f690400SShri Abhyankar   PetscFunctionBegin;
26668f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26678f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26688f690400SShri Abhyankar   t  = a->solve_work;
26698f690400SShri Abhyankar 
26708f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
267129b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
26728f690400SShri Abhyankar 
26738f690400SShri Abhyankar   /* forward solve the lower triangular */
267429b92fc1SShri Abhyankar   idx    = 4*r[0];
26758f690400SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
26768f690400SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
26778f690400SShri Abhyankar   for (i=1; i<n; i++) {
26788f690400SShri Abhyankar     v     = aa + 16*ai[i];
26798f690400SShri Abhyankar     vi    = aj + ai[i];
26808f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
268129b92fc1SShri Abhyankar     idx   = 4*r[i];
26828f690400SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
268329b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
268429b92fc1SShri Abhyankar       idx   = 4*vi[m];
26858f690400SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
26868f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
26878f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
26888f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
26898f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
26908f690400SShri Abhyankar       v    += 16;
26918f690400SShri Abhyankar     }
26928f690400SShri Abhyankar     idx        = 4*i;
26938f690400SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
26948f690400SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
26958f690400SShri Abhyankar   }
26968f690400SShri Abhyankar   /* backward solve the upper triangular */
26978f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
26988f690400SShri Abhyankar     k    = 2*n-i;
26998f690400SShri Abhyankar     v    = aa + 16*ai[k];
27008f690400SShri Abhyankar     vi   = aj + ai[k];
27018f690400SShri Abhyankar     nz   = ai[k+1] - ai[k] - 1;
27028f690400SShri Abhyankar     idt  = 4*i;
27038f690400SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
27048f690400SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
270529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
270629b92fc1SShri Abhyankar       idx   = 4*vi[m];
27078f690400SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
27088f690400SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
27098f690400SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
27108f690400SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
27118f690400SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
27128f690400SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
27138f690400SShri Abhyankar       v += 16;
27148f690400SShri Abhyankar     }
271529b92fc1SShri Abhyankar     idc      = 4*c[i];
27168f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
27178f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
27188f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
27198f690400SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
27208f690400SShri Abhyankar   }
27218f690400SShri Abhyankar 
27228f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27238f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
27248f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27258f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
27268f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
27278f690400SShri Abhyankar   PetscFunctionReturn(0);
27288f690400SShri Abhyankar }
27298f690400SShri Abhyankar 
27308f690400SShri Abhyankar #undef __FUNCT__
2731*78bb4007SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct_v2"
2732*78bb4007SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct_v2(Mat A,Vec bb,Vec xx)
2733*78bb4007SShri Abhyankar {
2734*78bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2735*78bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
2736*78bb4007SShri Abhyankar   PetscErrorCode    ierr;
2737*78bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
2738*78bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2739*78bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2740*78bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2741*78bb4007SShri Abhyankar   const PetscScalar *b;
2742*78bb4007SShri Abhyankar 
2743*78bb4007SShri Abhyankar   PetscFunctionBegin;
2744*78bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2745*78bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2746*78bb4007SShri Abhyankar   t  = a->solve_work;
2747*78bb4007SShri Abhyankar 
2748*78bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2749*78bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
2750*78bb4007SShri Abhyankar 
2751*78bb4007SShri Abhyankar   /* forward solve the lower triangular */
2752*78bb4007SShri Abhyankar   idx    = 4*r[0];
2753*78bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
2754*78bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
2755*78bb4007SShri Abhyankar   for (i=1; i<n; i++) {
2756*78bb4007SShri Abhyankar     v     = aa + 16*ai[i];
2757*78bb4007SShri Abhyankar     vi    = aj + ai[i];
2758*78bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
2759*78bb4007SShri Abhyankar     idx   = 4*r[i];
2760*78bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2761*78bb4007SShri Abhyankar     for(m=0;m<nz;m++){
2762*78bb4007SShri Abhyankar       idx   = 4*vi[m];
2763*78bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2764*78bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2765*78bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2766*78bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2767*78bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2768*78bb4007SShri Abhyankar       v    += 16;
2769*78bb4007SShri Abhyankar     }
2770*78bb4007SShri Abhyankar     idx        = 4*i;
2771*78bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
2772*78bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
2773*78bb4007SShri Abhyankar   }
2774*78bb4007SShri Abhyankar   /* backward solve the upper triangular */
2775*78bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
2776*78bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
2777*78bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
2778*78bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
2779*78bb4007SShri Abhyankar     idt  = 4*i;
2780*78bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
2781*78bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
2782*78bb4007SShri Abhyankar     for(m=0;m<nz;m++){
2783*78bb4007SShri Abhyankar       idx   = 4*vi[m];
2784*78bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
2785*78bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
2786*78bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2787*78bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2788*78bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2789*78bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2790*78bb4007SShri Abhyankar       v += 16;
2791*78bb4007SShri Abhyankar     }
2792*78bb4007SShri Abhyankar     idc      = 4*c[i];
2793*78bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2794*78bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2795*78bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2796*78bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2797*78bb4007SShri Abhyankar   }
2798*78bb4007SShri Abhyankar 
2799*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2800*78bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2801*78bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2802*78bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2803*78bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2804*78bb4007SShri Abhyankar   PetscFunctionReturn(0);
2805*78bb4007SShri Abhyankar }
2806*78bb4007SShri Abhyankar 
2807*78bb4007SShri Abhyankar #undef __FUNCT__
2808f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2809dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2810f26ec98cSKris Buschelman {
2811f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2812f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
28136849ba73SBarry Smith   PetscErrorCode    ierr;
28145d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
28155d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2816d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2817d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2818d9fead3dSBarry Smith   PetscScalar       *x;
2819d9fead3dSBarry Smith   const PetscScalar *b;
2820f26ec98cSKris Buschelman 
2821f26ec98cSKris Buschelman   PetscFunctionBegin;
2822d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28231ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2824f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2825f26ec98cSKris Buschelman 
2826f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2827f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2828f26ec98cSKris Buschelman 
2829f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2830f26ec98cSKris Buschelman   idx    = 4*(*r++);
2831f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2832f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2833f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2834f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2835f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2836f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2837f26ec98cSKris Buschelman     vi    = aj + ai[i];
2838f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2839f26ec98cSKris Buschelman     idx   = 4*(*r++);
2840f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2841f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2842f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2843f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2844f26ec98cSKris Buschelman     while (nz--) {
2845f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2846f26ec98cSKris Buschelman       x1  = t[idx];
2847f26ec98cSKris Buschelman       x2  = t[1+idx];
2848f26ec98cSKris Buschelman       x3  = t[2+idx];
2849f26ec98cSKris Buschelman       x4  = t[3+idx];
2850f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2851f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2852f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2853f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2854f26ec98cSKris Buschelman       v    += 16;
2855f26ec98cSKris Buschelman     }
2856f26ec98cSKris Buschelman     idx        = 4*i;
2857f26ec98cSKris Buschelman     t[idx]   = s1;
2858f26ec98cSKris Buschelman     t[1+idx] = s2;
2859f26ec98cSKris Buschelman     t[2+idx] = s3;
2860f26ec98cSKris Buschelman     t[3+idx] = s4;
2861f26ec98cSKris Buschelman   }
2862f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2863f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2864f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2865f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2866f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2867f26ec98cSKris Buschelman     idt  = 4*i;
2868f26ec98cSKris Buschelman     s1 = t[idt];
2869f26ec98cSKris Buschelman     s2 = t[1+idt];
2870f26ec98cSKris Buschelman     s3 = t[2+idt];
2871f26ec98cSKris Buschelman     s4 = t[3+idt];
2872f26ec98cSKris Buschelman     while (nz--) {
2873f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2874f26ec98cSKris Buschelman       x1  = t[idx];
2875f26ec98cSKris Buschelman       x2  = t[1+idx];
2876f26ec98cSKris Buschelman       x3  = t[2+idx];
2877f26ec98cSKris Buschelman       x4  = t[3+idx];
2878f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2879f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2880f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2881f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2882f26ec98cSKris Buschelman       v += 16;
2883f26ec98cSKris Buschelman     }
2884f26ec98cSKris Buschelman     idc      = 4*(*c--);
2885f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2886f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2887f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2888f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2889f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2890f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2891f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2892f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2893f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2894f26ec98cSKris Buschelman  }
2895f26ec98cSKris Buschelman 
2896f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2897f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2898d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28991ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2900dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2901f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2902f26ec98cSKris Buschelman }
2903f26ec98cSKris Buschelman 
290424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
290524c233c2SKris Buschelman 
290624c233c2SKris Buschelman #include PETSC_HAVE_SSE
290724c233c2SKris Buschelman 
290824c233c2SKris Buschelman #undef __FUNCT__
290924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2910dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
291124c233c2SKris Buschelman {
291224c233c2SKris Buschelman   /*
291324c233c2SKris Buschelman      Note: This code uses demotion of double
291424c233c2SKris Buschelman      to float when performing the mixed-mode computation.
291524c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
291624c233c2SKris Buschelman   */
291724c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
291824c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
29196849ba73SBarry Smith   PetscErrorCode ierr;
29205d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
29215d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
292224c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
292387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
292424c233c2SKris Buschelman 
292524c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
292624c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
292724c233c2SKris Buschelman   unsigned long   offset;
292824c233c2SKris Buschelman 
292924c233c2SKris Buschelman   PetscFunctionBegin;
293024c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
293124c233c2SKris Buschelman 
293224c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
293324c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
293424c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
293524c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
293624c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
293724c233c2SKris Buschelman 
29381ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
29391ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
294024c233c2SKris Buschelman     t  = a->solve_work;
294124c233c2SKris Buschelman 
294224c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
294324c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
294424c233c2SKris Buschelman 
294524c233c2SKris Buschelman     /* forward solve the lower triangular */
294624c233c2SKris Buschelman     idx  = 4*(*r++);
294724c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
294824c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
294924c233c2SKris Buschelman     v    =  aa + 16*ai[1];
295024c233c2SKris Buschelman 
295124c233c2SKris Buschelman     for (i=1; i<n;) {
295224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
295324c233c2SKris Buschelman       vi   =  aj      + ai[i];
295424c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
295524c233c2SKris Buschelman       idx  =  4*(*r++);
295624c233c2SKris Buschelman 
295724c233c2SKris Buschelman       /* Demote sum from double to float */
295824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
295924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
296024c233c2SKris Buschelman 
296124c233c2SKris Buschelman       while (nz--) {
296224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
296324c233c2SKris Buschelman         idx = 4*(*vi++);
296424c233c2SKris Buschelman 
296524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
296624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
296724c233c2SKris Buschelman 
296824c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
296924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
297024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
297124c233c2SKris Buschelman 
297224c233c2SKris Buschelman           /* First Column */
297324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
297424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
297524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
297624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
297724c233c2SKris Buschelman 
297824c233c2SKris Buschelman           /* Second Column */
297924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
298024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
298124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
298224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
298324c233c2SKris Buschelman 
298424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
298524c233c2SKris Buschelman 
298624c233c2SKris Buschelman           /* Third Column */
298724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
298824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
298924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
299024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
299124c233c2SKris Buschelman 
299224c233c2SKris Buschelman           /* Fourth Column */
299324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
299424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
299524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
299624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
299724c233c2SKris Buschelman         SSE_INLINE_END_2
299824c233c2SKris Buschelman 
299924c233c2SKris Buschelman         v  += 16;
300024c233c2SKris Buschelman       }
300124c233c2SKris Buschelman       idx = 4*i;
300224c233c2SKris Buschelman       v   = aa + 16*ai[++i];
300324c233c2SKris Buschelman       PREFETCH_NTA(v);
300424c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
300524c233c2SKris Buschelman 
300624c233c2SKris Buschelman       /* Promote result from float to double */
300724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
300824c233c2SKris Buschelman     }
300924c233c2SKris Buschelman     /* backward solve the upper triangular */
301024c233c2SKris Buschelman     idt  = 4*(n-1);
301124c233c2SKris Buschelman     ai16 = 16*diag[n-1];
301224c233c2SKris Buschelman     v    = aa + ai16 + 16;
301324c233c2SKris Buschelman     for (i=n-1; i>=0;){
301424c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
301524c233c2SKris Buschelman       vi = aj + diag[i] + 1;
301624c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
301724c233c2SKris Buschelman 
301824c233c2SKris Buschelman       /* Demote accumulator from double to float */
301924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
302024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
302124c233c2SKris Buschelman 
302224c233c2SKris Buschelman       while (nz--) {
302324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
302424c233c2SKris Buschelman         idx = 4*(*vi++);
302524c233c2SKris Buschelman 
302624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
302724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
302824c233c2SKris Buschelman 
302924c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
303024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
303124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
303224c233c2SKris Buschelman 
303324c233c2SKris Buschelman           /* First Column */
303424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
303524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
303624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
303724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
303824c233c2SKris Buschelman 
303924c233c2SKris Buschelman           /* Second Column */
304024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
304124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
304224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
304324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
304424c233c2SKris Buschelman 
304524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
304624c233c2SKris Buschelman 
304724c233c2SKris Buschelman           /* Third Column */
304824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
304924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
305024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
305124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
305224c233c2SKris Buschelman 
305324c233c2SKris Buschelman           /* Fourth Column */
305424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
305524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
305624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
305724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
305824c233c2SKris Buschelman         SSE_INLINE_END_2
305924c233c2SKris Buschelman         v  += 16;
306024c233c2SKris Buschelman       }
306124c233c2SKris Buschelman       v    = aa + ai16;
306224c233c2SKris Buschelman       ai16 = 16*diag[--i];
306324c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
306424c233c2SKris Buschelman       /*
306524c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
306624c233c2SKris Buschelman          which was inverted as part of the factorization
306724c233c2SKris Buschelman       */
306824c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
306924c233c2SKris Buschelman         /* First Column */
307024c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
307124c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
307224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
307324c233c2SKris Buschelman 
307424c233c2SKris Buschelman         /* Second Column */
307524c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
307624c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
307724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
307824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
307924c233c2SKris Buschelman 
308024c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
308124c233c2SKris Buschelman 
308224c233c2SKris Buschelman         /* Third Column */
308324c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
308424c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
308524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
308624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
308724c233c2SKris Buschelman 
308824c233c2SKris Buschelman         /* Fourth Column */
308924c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
309024c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
309124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
309224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
309324c233c2SKris Buschelman 
309424c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
309524c233c2SKris Buschelman       SSE_INLINE_END_3
309624c233c2SKris Buschelman 
309724c233c2SKris Buschelman       /* Promote solution from float to double */
309824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
309924c233c2SKris Buschelman 
310024c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
310124c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
310224c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
310324c233c2SKris Buschelman       idc  = 4*(*c--);
310424c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
310524c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
310624c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
310724c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
310824c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
310924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
311024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
311124c233c2SKris Buschelman       SSE_INLINE_END_2
311224c233c2SKris Buschelman       v    = aa + ai16 + 16;
311324c233c2SKris Buschelman       idt -= 4;
311424c233c2SKris Buschelman     }
311524c233c2SKris Buschelman 
311624c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
311724c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31181ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
31191ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3120dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
312124c233c2SKris Buschelman   SSE_SCOPE_END;
312224c233c2SKris Buschelman   PetscFunctionReturn(0);
312324c233c2SKris Buschelman }
312424c233c2SKris Buschelman 
312524c233c2SKris Buschelman #endif
31260ef38995SBarry Smith 
31270ef38995SBarry Smith 
31284e2b4712SSatish Balay /*
31294e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
31304e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
31314e2b4712SSatish Balay */
31324a2ae208SSatish Balay #undef __FUNCT__
31334a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3134dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
31354e2b4712SSatish Balay {
31364e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3137356650c2SBarry Smith   PetscInt          n=a->mbs;
3138356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3139dfbe8321SBarry Smith   PetscErrorCode    ierr;
3140356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3141d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3142d9fead3dSBarry Smith   PetscScalar       *x;
3143d9fead3dSBarry Smith   const PetscScalar *b;
31444e2b4712SSatish Balay 
31454e2b4712SSatish Balay   PetscFunctionBegin;
3146d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
31484e2b4712SSatish Balay 
3149aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
31502853dc0eSBarry Smith   {
315187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
31522853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
31532853dc0eSBarry Smith   }
3154aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
31552853dc0eSBarry Smith   {
315687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
31572853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
31582853dc0eSBarry Smith   }
3159aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
31602853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3161e1293385SBarry Smith #else
316230d4dcafSBarry Smith   {
316387828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3164d9fead3dSBarry Smith     const MatScalar *v;
3165356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3166356650c2SBarry Smith     const PetscInt  *vi;
3167e1293385SBarry Smith 
31684e2b4712SSatish Balay   /* forward solve the lower triangular */
31694e2b4712SSatish Balay   idx    = 0;
3170e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
31714e2b4712SSatish Balay   for (i=1; i<n; i++) {
31724e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
31734e2b4712SSatish Balay     vi    =  aj      + ai[i];
31744e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3175e1293385SBarry Smith     idx   +=  4;
3176f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
31774e2b4712SSatish Balay     while (nz--) {
31784e2b4712SSatish Balay       jdx   = 4*(*vi++);
31794e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3180f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3181f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3182f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3183f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
31844e2b4712SSatish Balay       v    += 16;
31854e2b4712SSatish Balay     }
3186f1af5d2fSBarry Smith     x[idx]   = s1;
3187f1af5d2fSBarry Smith     x[1+idx] = s2;
3188f1af5d2fSBarry Smith     x[2+idx] = s3;
3189f1af5d2fSBarry Smith     x[3+idx] = s4;
31904e2b4712SSatish Balay   }
31914e2b4712SSatish Balay   /* backward solve the upper triangular */
31924e555682SBarry Smith   idt = 4*(n-1);
31934e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
31944e555682SBarry Smith     ai16 = 16*diag[i];
31954e555682SBarry Smith     v    = aa + ai16 + 16;
31964e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
31974e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3198f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3199f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
32004e2b4712SSatish Balay     while (nz--) {
32014e2b4712SSatish Balay       idx   = 4*(*vi++);
32024e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3203f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3204f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3205f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3206f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
32074e2b4712SSatish Balay       v    += 16;
32084e2b4712SSatish Balay     }
32094e555682SBarry Smith     v        = aa + ai16;
3210f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3211f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3212f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3213f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3214329f5518SBarry Smith     idt -= 4;
32154e2b4712SSatish Balay   }
321630d4dcafSBarry Smith   }
3217e1293385SBarry Smith #endif
32184e2b4712SSatish Balay 
3219d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3221dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
32224e2b4712SSatish Balay   PetscFunctionReturn(0);
32234e2b4712SSatish Balay }
32244e2b4712SSatish Balay 
3225f26ec98cSKris Buschelman #undef __FUNCT__
3226cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3227cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3228cee9d6f2SShri Abhyankar {
3229cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
32306464896eSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3231cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3232cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3233cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3234cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3235cee9d6f2SShri Abhyankar     PetscScalar       *x;
3236cee9d6f2SShri Abhyankar     const PetscScalar *b;
3237cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3238cee9d6f2SShri Abhyankar 
3239cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3240cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3241cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3242cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3243cee9d6f2SShri Abhyankar     idx    = 0;
3244cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3245cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3246cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3247cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3248cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3249cee9d6f2SShri Abhyankar       idx   = bs*i;
3250cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
32516464896eSShri Abhyankar       for(k=0;k<nz;k++) {
32526464896eSShri Abhyankar           jdx   = bs*vi[k];
3253cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3254cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3255cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3256cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3257cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3258cee9d6f2SShri Abhyankar 
3259cee9d6f2SShri Abhyankar           v   +=  bs2;
3260cee9d6f2SShri Abhyankar         }
3261cee9d6f2SShri Abhyankar 
3262cee9d6f2SShri Abhyankar        x[idx]   = s1;
3263cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3264cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3265cee9d6f2SShri Abhyankar        x[3+idx] = s4;
3266cee9d6f2SShri Abhyankar     }
3267cee9d6f2SShri Abhyankar 
3268cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3269cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3270cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3271cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3272cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3273cee9d6f2SShri Abhyankar      idt = bs*i;
3274cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3275cee9d6f2SShri Abhyankar 
32766464896eSShri Abhyankar     for(k=0;k<nz;k++){
32776464896eSShri Abhyankar       idx   = bs*vi[k];
3278cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3279cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3280cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3281cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3282cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3283cee9d6f2SShri Abhyankar 
3284cee9d6f2SShri Abhyankar         v   +=  bs2;
3285cee9d6f2SShri Abhyankar     }
3286cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3287cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3288cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3289cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3290cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3291cee9d6f2SShri Abhyankar 
3292cee9d6f2SShri Abhyankar   }
3293cee9d6f2SShri Abhyankar 
3294cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3295cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3296cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3297cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3298cee9d6f2SShri Abhyankar }
3299cee9d6f2SShri Abhyankar 
3300b2b2dd24SShri Abhyankar #undef __FUNCT__
3301b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2"
3302b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
3303b2b2dd24SShri Abhyankar {
3304b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3305b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3306b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3307b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3308b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3309b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3310b2b2dd24SShri Abhyankar     PetscScalar       *x;
3311b2b2dd24SShri Abhyankar     const PetscScalar *b;
3312b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3313cee9d6f2SShri Abhyankar 
3314b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3315b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3316b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3317b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3318b2b2dd24SShri Abhyankar     idx    = 0;
3319b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3320b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3321b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3322b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3323b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3324b2b2dd24SShri Abhyankar       idx   = bs*i;
3325b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3326b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3327b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3328b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3329b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3330b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3331b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3332b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3333b2b2dd24SShri Abhyankar 
3334b2b2dd24SShri Abhyankar           v   +=  bs2;
3335b2b2dd24SShri Abhyankar         }
3336b2b2dd24SShri Abhyankar 
3337b2b2dd24SShri Abhyankar        x[idx]   = s1;
3338b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3339b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3340b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3341b2b2dd24SShri Abhyankar     }
3342b2b2dd24SShri Abhyankar 
3343b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3344b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3345b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3346b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3347b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3348b2b2dd24SShri Abhyankar      idt = bs*i;
3349b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3350b2b2dd24SShri Abhyankar 
3351b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3352b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3353b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3354b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3355b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3356b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3357b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3358b2b2dd24SShri Abhyankar 
3359b2b2dd24SShri Abhyankar         v   +=  bs2;
3360b2b2dd24SShri Abhyankar     }
3361b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3362b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3363b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3364b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3365b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3366b2b2dd24SShri Abhyankar 
3367b2b2dd24SShri Abhyankar   }
3368b2b2dd24SShri Abhyankar 
3369b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3370b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3371b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3372b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3373b2b2dd24SShri Abhyankar }
3374cee9d6f2SShri Abhyankar 
3375cee9d6f2SShri Abhyankar #undef __FUNCT__
3376f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3377dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3378f26ec98cSKris Buschelman {
3379f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3380690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3381dfbe8321SBarry Smith   PetscErrorCode ierr;
3382690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3383f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3384f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3385f26ec98cSKris Buschelman 
3386f26ec98cSKris Buschelman   PetscFunctionBegin;
33871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
33881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3389f26ec98cSKris Buschelman 
3390f26ec98cSKris Buschelman   {
3391f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3392f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3393690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3394f26ec98cSKris Buschelman 
3395f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3396f26ec98cSKris Buschelman     idx  = 0;
3397f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3398f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3399f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3400f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3401f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3402f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3403f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3404f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3405f26ec98cSKris Buschelman       idx   +=  4;
3406f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3407f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3408f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3409f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3410f26ec98cSKris Buschelman       while (nz--) {
3411f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3412f26ec98cSKris Buschelman         x1  = t[jdx];
3413f26ec98cSKris Buschelman         x2  = t[1+jdx];
3414f26ec98cSKris Buschelman         x3  = t[2+jdx];
3415f26ec98cSKris Buschelman         x4  = t[3+jdx];
3416f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3417f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3418f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3419f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3420f26ec98cSKris Buschelman         v    += 16;
3421f26ec98cSKris Buschelman       }
3422f26ec98cSKris Buschelman       t[idx]   = s1;
3423f26ec98cSKris Buschelman       t[1+idx] = s2;
3424f26ec98cSKris Buschelman       t[2+idx] = s3;
3425f26ec98cSKris Buschelman       t[3+idx] = s4;
3426f26ec98cSKris Buschelman     }
3427f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3428f26ec98cSKris Buschelman     idt = 4*(n-1);
3429f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3430f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3431f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3432f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3433f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3434f26ec98cSKris Buschelman       s1   = t[idt];
3435f26ec98cSKris Buschelman       s2   = t[1+idt];
3436f26ec98cSKris Buschelman       s3   = t[2+idt];
3437f26ec98cSKris Buschelman       s4   = t[3+idt];
3438f26ec98cSKris Buschelman       while (nz--) {
3439f26ec98cSKris Buschelman         idx = 4*(*vi++);
3440f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3441f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3442f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3443f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3444f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3445f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3446f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3447f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3448f26ec98cSKris Buschelman         v    += 16;
3449f26ec98cSKris Buschelman       }
3450f26ec98cSKris Buschelman       v        = aa + ai16;
3451f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3452f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3453f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3454f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3455f26ec98cSKris Buschelman       idt -= 4;
3456f26ec98cSKris Buschelman     }
3457f26ec98cSKris Buschelman   }
3458f26ec98cSKris Buschelman 
34591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3461dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3462f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3463f26ec98cSKris Buschelman }
3464f26ec98cSKris Buschelman 
34653660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
34663660e330SKris Buschelman 
34673660e330SKris Buschelman #include PETSC_HAVE_SSE
34683660e330SKris Buschelman #undef __FUNCT__
34697cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
34713660e330SKris Buschelman {
34723660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
34732aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3474dfbe8321SBarry Smith   PetscErrorCode ierr;
3475dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
34763660e330SKris Buschelman   MatScalar      *aa=a->a;
347787828ca2SBarry Smith   PetscScalar    *x,*b;
34783660e330SKris Buschelman 
34793660e330SKris Buschelman   PetscFunctionBegin;
34803660e330SKris Buschelman   SSE_SCOPE_BEGIN;
34813660e330SKris Buschelman   /*
34823660e330SKris Buschelman      Note: This code currently uses demotion of double
34833660e330SKris Buschelman      to float when performing the mixed-mode computation.
34843660e330SKris Buschelman      This may not be numerically reasonable for all applications.
34853660e330SKris Buschelman   */
34863660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
34873660e330SKris Buschelman 
34881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
34903660e330SKris Buschelman   {
3491eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3492eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
34932aa5897fSKris Buschelman     int            nz,i,idt,ai16;
34942aa5897fSKris Buschelman     unsigned int   jdx,idx;
34952aa5897fSKris Buschelman     unsigned short *vi;
3496eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
34973660e330SKris Buschelman 
3498eb05f457SKris Buschelman     /* First block is the identity. */
34993660e330SKris Buschelman     idx  = 0;
3500eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
35012aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
35023660e330SKris Buschelman 
35033660e330SKris Buschelman     for (i=1; i<n;) {
35043660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
35053660e330SKris Buschelman       vi   =  aj      + ai[i];
35063660e330SKris Buschelman       nz   =  diag[i] - ai[i];
35073660e330SKris Buschelman       idx +=  4;
35083660e330SKris Buschelman 
3509eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3510eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3511eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
35123660e330SKris Buschelman 
35133660e330SKris Buschelman       while (nz--) {
35143660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
35152aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
35163660e330SKris Buschelman 
35173660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3518eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
35193660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
35203660e330SKris Buschelman 
35213660e330SKris Buschelman           /* First Column */
35223660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
35233660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
35243660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
35253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
35263660e330SKris Buschelman 
35273660e330SKris Buschelman           /* Second Column */
35283660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
35293660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
35303660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
35313660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
35323660e330SKris Buschelman 
35333660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
35343660e330SKris Buschelman 
35353660e330SKris Buschelman           /* Third Column */
35363660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
35373660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
35383660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
35393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
35403660e330SKris Buschelman 
35413660e330SKris Buschelman           /* Fourth Column */
35423660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
35433660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
35443660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
35453660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
35463660e330SKris Buschelman         SSE_INLINE_END_2
35473660e330SKris Buschelman 
35483660e330SKris Buschelman         v  += 16;
35493660e330SKris Buschelman       }
35503660e330SKris Buschelman       v    =  aa + 16*ai[++i];
35513660e330SKris Buschelman       PREFETCH_NTA(v);
3552eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
35533660e330SKris Buschelman     }
3554eb05f457SKris Buschelman 
3555eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3556eb05f457SKris Buschelman 
35573660e330SKris Buschelman     idt  = 4*(n-1);
35583660e330SKris Buschelman     ai16 = 16*diag[n-1];
35593660e330SKris Buschelman     v    = aa + ai16 + 16;
35603660e330SKris Buschelman     for (i=n-1; i>=0;){
35613660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
35623660e330SKris Buschelman       vi = aj + diag[i] + 1;
35633660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
35643660e330SKris Buschelman 
3565eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
35663660e330SKris Buschelman 
35673660e330SKris Buschelman       while (nz--) {
35683660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
35692aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
35703660e330SKris Buschelman 
35713660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3572eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
35733660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
35743660e330SKris Buschelman 
35753660e330SKris Buschelman           /* First Column */
35763660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
35773660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
35783660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
35793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
35803660e330SKris Buschelman 
35813660e330SKris Buschelman           /* Second Column */
35823660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
35833660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
35843660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
35853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
35863660e330SKris Buschelman 
35873660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
35883660e330SKris Buschelman 
35893660e330SKris Buschelman           /* Third Column */
35903660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
35913660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
35923660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
35933660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
35943660e330SKris Buschelman 
35953660e330SKris Buschelman           /* Fourth Column */
35963660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
35973660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
35983660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
35993660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36003660e330SKris Buschelman         SSE_INLINE_END_2
36013660e330SKris Buschelman         v  += 16;
36023660e330SKris Buschelman       }
36033660e330SKris Buschelman       v    = aa + ai16;
36043660e330SKris Buschelman       ai16 = 16*diag[--i];
36053660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
36063660e330SKris Buschelman       /*
36073660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
36083660e330SKris Buschelman          which was inverted as part of the factorization
36093660e330SKris Buschelman       */
3610eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
36113660e330SKris Buschelman         /* First Column */
36123660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
36133660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
36143660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
36153660e330SKris Buschelman 
36163660e330SKris Buschelman         /* Second Column */
36173660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
36183660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
36193660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
36203660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
36213660e330SKris Buschelman 
36223660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
36233660e330SKris Buschelman 
36243660e330SKris Buschelman         /* Third Column */
36253660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
36263660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
36273660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
36283660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
36293660e330SKris Buschelman 
36303660e330SKris Buschelman         /* Fourth Column */
36313660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
36323660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
36333660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
36343660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
36353660e330SKris Buschelman 
36363660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
36373660e330SKris Buschelman       SSE_INLINE_END_3
36383660e330SKris Buschelman 
36393660e330SKris Buschelman       v    = aa + ai16 + 16;
36403660e330SKris Buschelman       idt -= 4;
36413660e330SKris Buschelman     }
3642eb05f457SKris Buschelman 
3643eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3644eb05f457SKris Buschelman     idt = 4*(n-1);
3645eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3646eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3647eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3648eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3649eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3650eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3651eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3652eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3653eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
365454693613SKris Buschelman       idt -= 4;
36553660e330SKris Buschelman     }
3656eb05f457SKris Buschelman 
3657eb05f457SKris Buschelman   } /* End of artificial scope. */
36581ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
36591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3660dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
36613660e330SKris Buschelman   SSE_SCOPE_END;
36623660e330SKris Buschelman   PetscFunctionReturn(0);
36633660e330SKris Buschelman }
36643660e330SKris Buschelman 
36657cf1b8d3SKris Buschelman #undef __FUNCT__
36667cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3667dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
36687cf1b8d3SKris Buschelman {
36697cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
36707cf1b8d3SKris Buschelman   int            *aj=a->j;
3671dfbe8321SBarry Smith   PetscErrorCode ierr;
3672dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
36737cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
36747cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
36757cf1b8d3SKris Buschelman 
36767cf1b8d3SKris Buschelman   PetscFunctionBegin;
36777cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
36787cf1b8d3SKris Buschelman   /*
36797cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
36807cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
36817cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
36827cf1b8d3SKris Buschelman   */
36837cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
36847cf1b8d3SKris Buschelman 
36851ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
36861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
36877cf1b8d3SKris Buschelman   {
36887cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
36897cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
36907cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
36917cf1b8d3SKris Buschelman     int       jdx,idx;
36927cf1b8d3SKris Buschelman     int       *vi;
36937cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
36947cf1b8d3SKris Buschelman 
36957cf1b8d3SKris Buschelman     /* First block is the identity. */
36967cf1b8d3SKris Buschelman     idx  = 0;
36977cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
36987cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
36997cf1b8d3SKris Buschelman 
37007cf1b8d3SKris Buschelman     for (i=1; i<n;) {
37017cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
37027cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
37037cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
37047cf1b8d3SKris Buschelman       idx +=  4;
37057cf1b8d3SKris Buschelman 
37067cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
37077cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
37087cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
37097cf1b8d3SKris Buschelman 
37107cf1b8d3SKris Buschelman       while (nz--) {
37117cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
37127cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
37137cf1b8d3SKris Buschelman /*          jdx = *vi++; */
37147cf1b8d3SKris Buschelman 
37157cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
37167cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
37177cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
37187cf1b8d3SKris Buschelman 
37197cf1b8d3SKris Buschelman           /* First Column */
37207cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
37217cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
37227cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
37237cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
37247cf1b8d3SKris Buschelman 
37257cf1b8d3SKris Buschelman           /* Second Column */
37267cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
37277cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
37287cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
37297cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
37307cf1b8d3SKris Buschelman 
37317cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
37327cf1b8d3SKris Buschelman 
37337cf1b8d3SKris Buschelman           /* Third Column */
37347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
37357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
37367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
37377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
37387cf1b8d3SKris Buschelman 
37397cf1b8d3SKris Buschelman           /* Fourth Column */
37407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
37417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
37427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
37437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
37447cf1b8d3SKris Buschelman         SSE_INLINE_END_2
37457cf1b8d3SKris Buschelman 
37467cf1b8d3SKris Buschelman         v  += 16;
37477cf1b8d3SKris Buschelman       }
37487cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
37497cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
37507cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
37517cf1b8d3SKris Buschelman     }
37527cf1b8d3SKris Buschelman 
37537cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
37547cf1b8d3SKris Buschelman 
37557cf1b8d3SKris Buschelman     idt  = 4*(n-1);
37567cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
37577cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
37587cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
37597cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
37607cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
37617cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
37627cf1b8d3SKris Buschelman 
37637cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
37647cf1b8d3SKris Buschelman 
37657cf1b8d3SKris Buschelman       while (nz--) {
37667cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
37677cf1b8d3SKris Buschelman         idx = 4*(*vi++);
37687cf1b8d3SKris Buschelman /*          idx = *vi++; */
37697cf1b8d3SKris Buschelman 
37707cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
37717cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
37727cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
37737cf1b8d3SKris Buschelman 
37747cf1b8d3SKris Buschelman           /* First Column */
37757cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
37767cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
37777cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
37787cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
37797cf1b8d3SKris Buschelman 
37807cf1b8d3SKris Buschelman           /* Second Column */
37817cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
37827cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
37837cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
37847cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
37857cf1b8d3SKris Buschelman 
37867cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
37877cf1b8d3SKris Buschelman 
37887cf1b8d3SKris Buschelman           /* Third Column */
37897cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
37907cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
37917cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
37927cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
37937cf1b8d3SKris Buschelman 
37947cf1b8d3SKris Buschelman           /* Fourth Column */
37957cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
37967cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
37977cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
37987cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
37997cf1b8d3SKris Buschelman         SSE_INLINE_END_2
38007cf1b8d3SKris Buschelman         v  += 16;
38017cf1b8d3SKris Buschelman       }
38027cf1b8d3SKris Buschelman       v    = aa + ai16;
38037cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
38047cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
38057cf1b8d3SKris Buschelman       /*
38067cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
38077cf1b8d3SKris Buschelman          which was inverted as part of the factorization
38087cf1b8d3SKris Buschelman       */
38097cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
38107cf1b8d3SKris Buschelman         /* First Column */
38117cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
38127cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
38137cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
38147cf1b8d3SKris Buschelman 
38157cf1b8d3SKris Buschelman         /* Second Column */
38167cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
38177cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
38187cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
38197cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
38207cf1b8d3SKris Buschelman 
38217cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
38227cf1b8d3SKris Buschelman 
38237cf1b8d3SKris Buschelman         /* Third Column */
38247cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
38257cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
38267cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
38277cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
38287cf1b8d3SKris Buschelman 
38297cf1b8d3SKris Buschelman         /* Fourth Column */
38307cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
38317cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
38327cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
38337cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
38347cf1b8d3SKris Buschelman 
38357cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
38367cf1b8d3SKris Buschelman       SSE_INLINE_END_3
38377cf1b8d3SKris Buschelman 
38387cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
38397cf1b8d3SKris Buschelman       idt -= 4;
38407cf1b8d3SKris Buschelman     }
38417cf1b8d3SKris Buschelman 
38427cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
38437cf1b8d3SKris Buschelman     idt = 4*(n-1);
38447cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
38457cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
38467cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
38477cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
38487cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
38497cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
38507cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
38517cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
38527cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
38537cf1b8d3SKris Buschelman       idt -= 4;
38547cf1b8d3SKris Buschelman     }
38557cf1b8d3SKris Buschelman 
38567cf1b8d3SKris Buschelman   } /* End of artificial scope. */
38571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
38581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3859dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
38607cf1b8d3SKris Buschelman   SSE_SCOPE_END;
38617cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
38627cf1b8d3SKris Buschelman }
38637cf1b8d3SKris Buschelman 
38643660e330SKris Buschelman #endif
38658f690400SShri Abhyankar 
38664a2ae208SSatish Balay #undef __FUNCT__
38674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3868dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
38694e2b4712SSatish Balay {
38704e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
38714e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
38726849ba73SBarry Smith   PetscErrorCode    ierr;
38735d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
38745d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3875d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3876d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3877d9fead3dSBarry Smith   const PetscScalar *b;
38784e2b4712SSatish Balay 
38794e2b4712SSatish Balay   PetscFunctionBegin;
3880d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3882f1af5d2fSBarry Smith   t  = a->solve_work;
38834e2b4712SSatish Balay 
38844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
38854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
38864e2b4712SSatish Balay 
38874e2b4712SSatish Balay   /* forward solve the lower triangular */
38884e2b4712SSatish Balay   idx    = 3*(*r++);
3889f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
38904e2b4712SSatish Balay   for (i=1; i<n; i++) {
38914e2b4712SSatish Balay     v     = aa + 9*ai[i];
38924e2b4712SSatish Balay     vi    = aj + ai[i];
38934e2b4712SSatish Balay     nz    = diag[i] - ai[i];
38944e2b4712SSatish Balay     idx   = 3*(*r++);
3895f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
38964e2b4712SSatish Balay     while (nz--) {
38974e2b4712SSatish Balay       idx   = 3*(*vi++);
3898f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3899f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3900f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3901f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39024e2b4712SSatish Balay       v += 9;
39034e2b4712SSatish Balay     }
39044e2b4712SSatish Balay     idx = 3*i;
3905f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
39064e2b4712SSatish Balay   }
39074e2b4712SSatish Balay   /* backward solve the upper triangular */
39084e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
39094e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
39104e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
39114e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
39124e2b4712SSatish Balay     idt  = 3*i;
3913f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
39144e2b4712SSatish Balay     while (nz--) {
39154e2b4712SSatish Balay       idx   = 3*(*vi++);
3916f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3917f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3918f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3919f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39204e2b4712SSatish Balay       v += 9;
39214e2b4712SSatish Balay     }
39224e2b4712SSatish Balay     idc = 3*(*c--);
39234e2b4712SSatish Balay     v   = aa + 9*diag[i];
3924f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3925f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3926f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
39274e2b4712SSatish Balay   }
39284e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
39294e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3930d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39311ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3932dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
39334e2b4712SSatish Balay   PetscFunctionReturn(0);
39344e2b4712SSatish Balay }
39354e2b4712SSatish Balay 
39368f690400SShri Abhyankar #undef __FUNCT__
39378f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
39388f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
39398f690400SShri Abhyankar {
39408f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
39418f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
39428f690400SShri Abhyankar   PetscErrorCode    ierr;
394329b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,k,m;
39448f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
39458f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
39468f690400SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
39478f690400SShri Abhyankar   const PetscScalar *b;
39488f690400SShri Abhyankar 
39498f690400SShri Abhyankar   PetscFunctionBegin;
39508f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39518f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
39528f690400SShri Abhyankar   t  = a->solve_work;
39538f690400SShri Abhyankar 
39548f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
395529b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
39568f690400SShri Abhyankar 
39578f690400SShri Abhyankar   /* forward solve the lower triangular */
395829b92fc1SShri Abhyankar   idx    = 3*r[0];
39598f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
39608f690400SShri Abhyankar   for (i=1; i<n; i++) {
39618f690400SShri Abhyankar     v     = aa + 9*ai[i];
39628f690400SShri Abhyankar     vi    = aj + ai[i];
39638f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
396429b92fc1SShri Abhyankar     idx   = 3*r[i];
39658f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
396629b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
396729b92fc1SShri Abhyankar       idx   = 3*vi[m];
39688f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
39698f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
39708f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
39718f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39728f690400SShri Abhyankar       v += 9;
39738f690400SShri Abhyankar     }
39748f690400SShri Abhyankar     idx = 3*i;
39758f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
39768f690400SShri Abhyankar   }
39778f690400SShri Abhyankar   /* backward solve the upper triangular */
39788f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
39798f690400SShri Abhyankar     k    = 2*n-i;
39808f690400SShri Abhyankar     v    = aa + 9*ai[k];
39818f690400SShri Abhyankar     vi   = aj + ai[k];
39828f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
39838f690400SShri Abhyankar     idt  = 3*i;
39848f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
398529b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
398629b92fc1SShri Abhyankar       idx   = 3*vi[m];
39878f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
39888f690400SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
39898f690400SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
39908f690400SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39918f690400SShri Abhyankar       v += 9;
39928f690400SShri Abhyankar     }
399329b92fc1SShri Abhyankar     idc = 3*c[i];
39948f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
39958f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
39968f690400SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
39978f690400SShri Abhyankar   }
39988f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
39998f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40008f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40018f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
40028f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
40038f690400SShri Abhyankar   PetscFunctionReturn(0);
40048f690400SShri Abhyankar }
40058f690400SShri Abhyankar 
40060c4413a7SShri Abhyankar #undef __FUNCT__
40070c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct_v2"
40080c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct_v2(Mat A,Vec bb,Vec xx)
40090c4413a7SShri Abhyankar {
40100c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
40110c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
40120c4413a7SShri Abhyankar   PetscErrorCode    ierr;
40130c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
40140c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
40150c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
40160c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
40170c4413a7SShri Abhyankar   const PetscScalar *b;
40180c4413a7SShri Abhyankar 
40190c4413a7SShri Abhyankar   PetscFunctionBegin;
40200c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40210c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
40220c4413a7SShri Abhyankar   t  = a->solve_work;
40230c4413a7SShri Abhyankar 
40240c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
40250c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
40260c4413a7SShri Abhyankar 
40270c4413a7SShri Abhyankar   /* forward solve the lower triangular */
40280c4413a7SShri Abhyankar   idx    = 3*r[0];
40290c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
40300c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
40310c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
40320c4413a7SShri Abhyankar     vi    = aj + ai[i];
40330c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
40340c4413a7SShri Abhyankar     idx   = 3*r[i];
40350c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
40360c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
40370c4413a7SShri Abhyankar       idx   = 3*vi[m];
40380c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40390c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40400c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40410c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40420c4413a7SShri Abhyankar       v += 9;
40430c4413a7SShri Abhyankar     }
40440c4413a7SShri Abhyankar     idx = 3*i;
40450c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
40460c4413a7SShri Abhyankar   }
40470c4413a7SShri Abhyankar   /* backward solve the upper triangular */
40480c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
40490c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
40500c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
40510c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
40520c4413a7SShri Abhyankar     idt  = 3*i;
40530c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
40540c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
40550c4413a7SShri Abhyankar       idx   = 3*vi[m];
40560c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40570c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40580c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40590c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40600c4413a7SShri Abhyankar       v += 9;
40610c4413a7SShri Abhyankar     }
40620c4413a7SShri Abhyankar     idc = 3*c[i];
40630c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
40640c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
40650c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
40660c4413a7SShri Abhyankar   }
40670c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
40680c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40690c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40700c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
40710c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
40720c4413a7SShri Abhyankar   PetscFunctionReturn(0);
40730c4413a7SShri Abhyankar }
40740c4413a7SShri Abhyankar 
407515091d37SBarry Smith /*
407615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
407715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
407815091d37SBarry Smith */
40794a2ae208SSatish Balay #undef __FUNCT__
40804a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4081dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
408215091d37SBarry Smith {
408315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4084690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4085dfbe8321SBarry Smith   PetscErrorCode    ierr;
4086690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4087d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4088d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4089d9fead3dSBarry Smith   const PetscScalar *b;
4090690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
409115091d37SBarry Smith 
409215091d37SBarry Smith   PetscFunctionBegin;
4093d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
409515091d37SBarry Smith 
409615091d37SBarry Smith   /* forward solve the lower triangular */
409715091d37SBarry Smith   idx    = 0;
409815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
409915091d37SBarry Smith   for (i=1; i<n; i++) {
410015091d37SBarry Smith     v     =  aa      + 9*ai[i];
410115091d37SBarry Smith     vi    =  aj      + ai[i];
410215091d37SBarry Smith     nz    =  diag[i] - ai[i];
410315091d37SBarry Smith     idx   +=  3;
4104f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
410515091d37SBarry Smith     while (nz--) {
410615091d37SBarry Smith       jdx   = 3*(*vi++);
410715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4108f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4109f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4110f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
411115091d37SBarry Smith       v    += 9;
411215091d37SBarry Smith     }
4113f1af5d2fSBarry Smith     x[idx]   = s1;
4114f1af5d2fSBarry Smith     x[1+idx] = s2;
4115f1af5d2fSBarry Smith     x[2+idx] = s3;
411615091d37SBarry Smith   }
411715091d37SBarry Smith   /* backward solve the upper triangular */
411815091d37SBarry Smith   for (i=n-1; i>=0; i--){
411915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
412015091d37SBarry Smith     vi   = aj + diag[i] + 1;
412115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
412215091d37SBarry Smith     idt  = 3*i;
4123f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4124f1af5d2fSBarry Smith     s3 = x[2+idt];
412515091d37SBarry Smith     while (nz--) {
412615091d37SBarry Smith       idx   = 3*(*vi++);
412715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4128f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4129f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4130f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
413115091d37SBarry Smith       v    += 9;
413215091d37SBarry Smith     }
413315091d37SBarry Smith     v        = aa +  9*diag[i];
4134f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4135f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4136f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
413715091d37SBarry Smith   }
413815091d37SBarry Smith 
4139d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4141dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
414215091d37SBarry Smith   PetscFunctionReturn(0);
414315091d37SBarry Smith }
414415091d37SBarry Smith 
41454a2ae208SSatish Balay #undef __FUNCT__
4146cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4147cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4148cee9d6f2SShri Abhyankar {
4149cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4150ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
4151cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4152cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
4153cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4154cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4155cee9d6f2SShri Abhyankar     PetscScalar       *x;
4156cee9d6f2SShri Abhyankar     const PetscScalar *b;
4157cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4158cee9d6f2SShri Abhyankar 
4159cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4160cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4161cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4162cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4163cee9d6f2SShri Abhyankar     idx    = 0;
4164cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4165cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4166cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
4167cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4168cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4169cee9d6f2SShri Abhyankar       idx   = bs*i;
4170cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4171ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4172ce3d78c0SShri Abhyankar          jdx   = bs*vi[k];
4173cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4174cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4175cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4176cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4177cee9d6f2SShri Abhyankar 
4178cee9d6f2SShri Abhyankar           v   +=  bs2;
4179cee9d6f2SShri Abhyankar         }
4180cee9d6f2SShri Abhyankar 
4181cee9d6f2SShri Abhyankar        x[idx]   = s1;
4182cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4183cee9d6f2SShri Abhyankar        x[2+idx] = s3;
4184cee9d6f2SShri Abhyankar     }
4185cee9d6f2SShri Abhyankar 
4186cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4187cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4188cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
4189cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4190cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4191cee9d6f2SShri Abhyankar      idt = bs*i;
4192cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4193cee9d6f2SShri Abhyankar 
4194ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4195ce3d78c0SShri Abhyankar        idx   = bs*vi[k];
4196cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4197cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4198cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4199cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4200cee9d6f2SShri Abhyankar 
4201cee9d6f2SShri Abhyankar         v   +=  bs2;
4202cee9d6f2SShri Abhyankar     }
4203cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4204cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4205cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4206cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4207cee9d6f2SShri Abhyankar 
4208cee9d6f2SShri Abhyankar   }
4209cee9d6f2SShri Abhyankar 
4210cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4211cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4212cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4213cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4214cee9d6f2SShri Abhyankar }
4215cee9d6f2SShri Abhyankar 
4216cee9d6f2SShri Abhyankar #undef __FUNCT__
4217b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2"
4218b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4219b2b2dd24SShri Abhyankar {
4220b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4221b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4222b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4223b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4224b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4225b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4226b2b2dd24SShri Abhyankar     PetscScalar       *x;
4227b2b2dd24SShri Abhyankar     const PetscScalar *b;
4228b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4229b2b2dd24SShri Abhyankar 
4230b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4231b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4232b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4233b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4234b2b2dd24SShri Abhyankar     idx    = 0;
4235b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4236b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4237b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4238b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4239b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4240b2b2dd24SShri Abhyankar       idx   = bs*i;
4241b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4242b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4243b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4244b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4245b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4246b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4247b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4248b2b2dd24SShri Abhyankar 
4249b2b2dd24SShri Abhyankar           v   +=  bs2;
4250b2b2dd24SShri Abhyankar         }
4251b2b2dd24SShri Abhyankar 
4252b2b2dd24SShri Abhyankar        x[idx]   = s1;
4253b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4254b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4255b2b2dd24SShri Abhyankar     }
4256b2b2dd24SShri Abhyankar 
4257b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4258b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4259b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4260b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4261b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4262b2b2dd24SShri Abhyankar      idt = bs*i;
4263b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4264b2b2dd24SShri Abhyankar 
4265b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4266b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4267b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4268b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4269b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4270b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4271b2b2dd24SShri Abhyankar 
4272b2b2dd24SShri Abhyankar         v   +=  bs2;
4273b2b2dd24SShri Abhyankar     }
4274b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4275b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4276b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4277b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4278b2b2dd24SShri Abhyankar 
4279b2b2dd24SShri Abhyankar   }
4280b2b2dd24SShri Abhyankar 
4281b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4282b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4283b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4284b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4285b2b2dd24SShri Abhyankar }
4286b2b2dd24SShri Abhyankar 
4287b2b2dd24SShri Abhyankar #undef __FUNCT__
42884a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4289dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
42904e2b4712SSatish Balay {
42914e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42924e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
42936849ba73SBarry Smith   PetscErrorCode    ierr;
42945d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
42955d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4296d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4297d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4298d9fead3dSBarry Smith   const PetscScalar *b;
42994e2b4712SSatish Balay 
43004e2b4712SSatish Balay   PetscFunctionBegin;
4301d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4303f1af5d2fSBarry Smith   t  = a->solve_work;
43044e2b4712SSatish Balay 
43054e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
43064e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
43074e2b4712SSatish Balay 
43084e2b4712SSatish Balay   /* forward solve the lower triangular */
43094e2b4712SSatish Balay   idx    = 2*(*r++);
4310f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
43114e2b4712SSatish Balay   for (i=1; i<n; i++) {
43124e2b4712SSatish Balay     v     = aa + 4*ai[i];
43134e2b4712SSatish Balay     vi    = aj + ai[i];
43144e2b4712SSatish Balay     nz    = diag[i] - ai[i];
43154e2b4712SSatish Balay     idx   = 2*(*r++);
4316f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
43174e2b4712SSatish Balay     while (nz--) {
43184e2b4712SSatish Balay       idx   = 2*(*vi++);
4319f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4320f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4321f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
43224e2b4712SSatish Balay       v += 4;
43234e2b4712SSatish Balay     }
43244e2b4712SSatish Balay     idx = 2*i;
4325f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
43264e2b4712SSatish Balay   }
43274e2b4712SSatish Balay   /* backward solve the upper triangular */
43284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
43294e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
43304e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
43314e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
43324e2b4712SSatish Balay     idt  = 2*i;
4333f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
43344e2b4712SSatish Balay     while (nz--) {
43354e2b4712SSatish Balay       idx   = 2*(*vi++);
4336f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4337f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4338f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
43394e2b4712SSatish Balay       v += 4;
43404e2b4712SSatish Balay     }
43414e2b4712SSatish Balay     idc = 2*(*c--);
43424e2b4712SSatish Balay     v   = aa + 4*diag[i];
4343f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4344f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
43454e2b4712SSatish Balay   }
43464e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
43474e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4348d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4350dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
43514e2b4712SSatish Balay   PetscFunctionReturn(0);
43524e2b4712SSatish Balay }
43534e2b4712SSatish Balay 
43548f690400SShri Abhyankar #undef __FUNCT__
43558f690400SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
43568f690400SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
43578f690400SShri Abhyankar {
43588f690400SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
43598f690400SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
43608f690400SShri Abhyankar   PetscErrorCode    ierr;
436129b92fc1SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,jdx,idt,idc,k,m;
43628f690400SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
43638f690400SShri Abhyankar   const MatScalar   *aa=a->a,*v;
43648f690400SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
43658f690400SShri Abhyankar   const PetscScalar *b;
43668f690400SShri Abhyankar 
43678f690400SShri Abhyankar   PetscFunctionBegin;
43688f690400SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43698f690400SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43708f690400SShri Abhyankar   t  = a->solve_work;
43718f690400SShri Abhyankar 
43728f690400SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
437329b92fc1SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
43748f690400SShri Abhyankar 
43758f690400SShri Abhyankar   /* forward solve the lower triangular */
437629b92fc1SShri Abhyankar   idx    = 2*r[0];
43778f690400SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
43788f690400SShri Abhyankar   for (i=1; i<n; i++) {
43798f690400SShri Abhyankar     v     = aa + 4*ai[i];
43808f690400SShri Abhyankar     vi    = aj + ai[i];
43818f690400SShri Abhyankar     nz    = ai[i+1] - ai[i];
438229b92fc1SShri Abhyankar     idx   = 2*r[i];
43838f690400SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
438429b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
438529b92fc1SShri Abhyankar       jdx   = 2*vi[m];
43868f690400SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
43878f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
43888f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
43898f690400SShri Abhyankar       v += 4;
43908f690400SShri Abhyankar     }
43918f690400SShri Abhyankar     idx = 2*i;
43928f690400SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
43938f690400SShri Abhyankar   }
43948f690400SShri Abhyankar   /* backward solve the upper triangular */
43958f690400SShri Abhyankar   for (i=n-1; i>=0; i--){
43968f690400SShri Abhyankar     k = 2*n-i;
43978f690400SShri Abhyankar     v    = aa + 4*ai[k];
43988f690400SShri Abhyankar     vi   = aj + ai[k];
43998f690400SShri Abhyankar     nz   = ai[k +1] - ai[k] - 1;
44008f690400SShri Abhyankar     idt  = 2*i;
44018f690400SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
440229b92fc1SShri Abhyankar     for(m=0;m<nz;m++){
440329b92fc1SShri Abhyankar       idx   = 2*vi[m];
44048f690400SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
44058f690400SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
44068f690400SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
44078f690400SShri Abhyankar       v += 4;
44088f690400SShri Abhyankar     }
440929b92fc1SShri Abhyankar     idc = 2*c[i];
44108f690400SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
44118f690400SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
44128f690400SShri Abhyankar   }
44138f690400SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
44148f690400SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
44158f690400SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44168f690400SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
44178f690400SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
44188f690400SShri Abhyankar   PetscFunctionReturn(0);
44198f690400SShri Abhyankar }
44208f690400SShri Abhyankar 
44210c4413a7SShri Abhyankar #undef __FUNCT__
44220c4413a7SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct_v2"
44230c4413a7SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct_v2(Mat A,Vec bb,Vec xx)
44240c4413a7SShri Abhyankar {
44250c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
44260c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
44270c4413a7SShri Abhyankar   PetscErrorCode    ierr;
44280c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
44290c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
44300c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
44310c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
44320c4413a7SShri Abhyankar   const PetscScalar *b;
44330c4413a7SShri Abhyankar 
44340c4413a7SShri Abhyankar   PetscFunctionBegin;
44350c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44360c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
44370c4413a7SShri Abhyankar   t  = a->solve_work;
44380c4413a7SShri Abhyankar 
44390c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
44400c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
44410c4413a7SShri Abhyankar 
44420c4413a7SShri Abhyankar   /* forward solve the lower triangular */
44430c4413a7SShri Abhyankar   idx    = 2*r[0];
44440c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
44450c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
44460c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
44470c4413a7SShri Abhyankar     vi    = aj + ai[i];
44480c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
44490c4413a7SShri Abhyankar     idx   = 2*r[i];
44500c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
44510c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
44520c4413a7SShri Abhyankar       jdx   = 2*vi[m];
44530c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
44540c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
44550c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
44560c4413a7SShri Abhyankar       v += 4;
44570c4413a7SShri Abhyankar     }
44580c4413a7SShri Abhyankar     idx = 2*i;
44590c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
44600c4413a7SShri Abhyankar   }
44610c4413a7SShri Abhyankar   /* backward solve the upper triangular */
44620c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
44630c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
44640c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
44650c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
44660c4413a7SShri Abhyankar     idt  = 2*i;
44670c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
44680c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
44690c4413a7SShri Abhyankar       idx   = 2*vi[m];
44700c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
44710c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
44720c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
44730c4413a7SShri Abhyankar       v += 4;
44740c4413a7SShri Abhyankar     }
44750c4413a7SShri Abhyankar     idc = 2*c[i];
44760c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
44770c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
44780c4413a7SShri Abhyankar   }
44790c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
44800c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
44810c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44820c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
44830c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
44840c4413a7SShri Abhyankar   PetscFunctionReturn(0);
44850c4413a7SShri Abhyankar }
44868f690400SShri Abhyankar 
448715091d37SBarry Smith /*
448815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
448915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
449015091d37SBarry Smith */
44914a2ae208SSatish Balay #undef __FUNCT__
44924a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4493dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
449415091d37SBarry Smith {
449515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4496690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4497dfbe8321SBarry Smith   PetscErrorCode    ierr;
4498690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4499d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4500d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4501d9fead3dSBarry Smith   const PetscScalar *b;
4502690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
450315091d37SBarry Smith 
450415091d37SBarry Smith   PetscFunctionBegin;
4505d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45061ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
450715091d37SBarry Smith 
450815091d37SBarry Smith   /* forward solve the lower triangular */
450915091d37SBarry Smith   idx    = 0;
451015091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
451115091d37SBarry Smith   for (i=1; i<n; i++) {
451215091d37SBarry Smith     v     =  aa      + 4*ai[i];
451315091d37SBarry Smith     vi    =  aj      + ai[i];
451415091d37SBarry Smith     nz    =  diag[i] - ai[i];
451515091d37SBarry Smith     idx   +=  2;
4516f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
451715091d37SBarry Smith     while (nz--) {
451815091d37SBarry Smith       jdx   = 2*(*vi++);
451915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4520f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4521f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
452215091d37SBarry Smith       v    += 4;
452315091d37SBarry Smith     }
4524f1af5d2fSBarry Smith     x[idx]   = s1;
4525f1af5d2fSBarry Smith     x[1+idx] = s2;
452615091d37SBarry Smith   }
452715091d37SBarry Smith   /* backward solve the upper triangular */
452815091d37SBarry Smith   for (i=n-1; i>=0; i--){
452915091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
453015091d37SBarry Smith     vi   = aj + diag[i] + 1;
453115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
453215091d37SBarry Smith     idt  = 2*i;
4533f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
453415091d37SBarry Smith     while (nz--) {
453515091d37SBarry Smith       idx   = 2*(*vi++);
453615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4537f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4538f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
453915091d37SBarry Smith       v    += 4;
454015091d37SBarry Smith     }
454115091d37SBarry Smith     v        = aa +  4*diag[i];
4542f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4543f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
454415091d37SBarry Smith   }
454515091d37SBarry Smith 
4546d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4548dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
454915091d37SBarry Smith   PetscFunctionReturn(0);
455015091d37SBarry Smith }
455115091d37SBarry Smith 
45524a2ae208SSatish Balay #undef __FUNCT__
4553cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4554cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4555cee9d6f2SShri Abhyankar {
4556cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4557ce3d78c0SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
4558cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
4559cee9d6f2SShri Abhyankar     PetscInt          jdx;
4560cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4561cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4562cee9d6f2SShri Abhyankar     const PetscScalar *b;
4563cee9d6f2SShri Abhyankar 
4564cee9d6f2SShri Abhyankar     PetscFunctionBegin;
4565cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4566cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4567cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
4568cee9d6f2SShri Abhyankar     idx    = 0;
4569cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4570cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
4571cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
4572cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
4573cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
4574cee9d6f2SShri Abhyankar        idx  = 2*i;
4575cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4576ce3d78c0SShri Abhyankar       for(k=0;k<nz;k++){
4577ce3d78c0SShri Abhyankar          jdx   = 2*vi[k];
4578cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4579cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4580cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4581cee9d6f2SShri Abhyankar            v   +=  4;
4582cee9d6f2SShri Abhyankar         }
4583cee9d6f2SShri Abhyankar        x[idx]   = s1;
4584cee9d6f2SShri Abhyankar        x[1+idx] = s2;
4585cee9d6f2SShri Abhyankar     }
4586cee9d6f2SShri Abhyankar 
4587cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
4588cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
4589cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
4590cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
4591cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
4592cee9d6f2SShri Abhyankar      idt = 2*i;
4593cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4594ce3d78c0SShri Abhyankar      for(k=0;k<nz;k++){
4595ce3d78c0SShri Abhyankar       idx   = 2*vi[k];
4596cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4597cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4598cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4599cee9d6f2SShri Abhyankar          v    += 4;
4600cee9d6f2SShri Abhyankar     }
4601cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
4602cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4603cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4604cee9d6f2SShri Abhyankar   }
4605cee9d6f2SShri Abhyankar 
4606cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4607cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4608cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4609cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
4610cee9d6f2SShri Abhyankar }
4611cee9d6f2SShri Abhyankar 
4612cee9d6f2SShri Abhyankar #undef __FUNCT__
4613b2b2dd24SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2"
4614b2b2dd24SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct_v2(Mat A,Vec bb,Vec xx)
4615b2b2dd24SShri Abhyankar {
4616b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4617b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4618b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4619b2b2dd24SShri Abhyankar     PetscInt          jdx;
4620b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4621b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4622b2b2dd24SShri Abhyankar     const PetscScalar *b;
4623b2b2dd24SShri Abhyankar 
4624b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4625b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4626b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4627b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4628b2b2dd24SShri Abhyankar     idx    = 0;
4629b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4630b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4631b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4632b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4633b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4634b2b2dd24SShri Abhyankar        idx  = 2*i;
4635b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4636b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4637b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4638b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4639b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4640b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4641b2b2dd24SShri Abhyankar            v   +=  4;
4642b2b2dd24SShri Abhyankar         }
4643b2b2dd24SShri Abhyankar        x[idx]   = s1;
4644b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4645b2b2dd24SShri Abhyankar     }
4646b2b2dd24SShri Abhyankar 
4647b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4648b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4649b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4650b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4651b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4652b2b2dd24SShri Abhyankar      idt = 2*i;
4653b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4654b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4655b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4656b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4657b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4658b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4659b2b2dd24SShri Abhyankar          v    += 4;
4660b2b2dd24SShri Abhyankar     }
4661b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4662b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4663b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4664b2b2dd24SShri Abhyankar   }
4665b2b2dd24SShri Abhyankar 
4666b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4667b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4668b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4669b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4670b2b2dd24SShri Abhyankar }
4671b2b2dd24SShri Abhyankar 
4672b2b2dd24SShri Abhyankar #undef __FUNCT__
46734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4674dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
46754e2b4712SSatish Balay {
46764e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
46774e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
46786849ba73SBarry Smith   PetscErrorCode ierr;
46795d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
46805d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
46813f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
468287828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
46834e2b4712SSatish Balay 
46844e2b4712SSatish Balay   PetscFunctionBegin;
46854e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
46864e2b4712SSatish Balay 
46871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
46881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4689f1af5d2fSBarry Smith   t  = a->solve_work;
46904e2b4712SSatish Balay 
46914e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
46924e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
46934e2b4712SSatish Balay 
46944e2b4712SSatish Balay   /* forward solve the lower triangular */
4695f1af5d2fSBarry Smith   t[0] = b[*r++];
46964e2b4712SSatish Balay   for (i=1; i<n; i++) {
46974e2b4712SSatish Balay     v     = aa + ai[i];
46984e2b4712SSatish Balay     vi    = aj + ai[i];
46994e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4700f1af5d2fSBarry Smith     s1  = b[*r++];
47014e2b4712SSatish Balay     while (nz--) {
4702f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
47034e2b4712SSatish Balay     }
4704f1af5d2fSBarry Smith     t[i] = s1;
47054e2b4712SSatish Balay   }
47064e2b4712SSatish Balay   /* backward solve the upper triangular */
47074e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
47084e2b4712SSatish Balay     v    = aa + diag[i] + 1;
47094e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
47104e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4711f1af5d2fSBarry Smith     s1 = t[i];
47124e2b4712SSatish Balay     while (nz--) {
4713f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
47144e2b4712SSatish Balay     }
4715f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
47164e2b4712SSatish Balay   }
47174e2b4712SSatish Balay 
47184e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
47194e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
47201ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
47211ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4722dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
47234e2b4712SSatish Balay   PetscFunctionReturn(0);
47244e2b4712SSatish Balay }
472515091d37SBarry Smith /*
472615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
472715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
472815091d37SBarry Smith */
47294a2ae208SSatish Balay #undef __FUNCT__
47304a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4731dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
473215091d37SBarry Smith {
473315091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4734690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4735dfbe8321SBarry Smith   PetscErrorCode ierr;
4736690b6cddSBarry Smith   PetscInt       *diag = a->diag;
473715091d37SBarry Smith   MatScalar      *aa=a->a;
473887828ca2SBarry Smith   PetscScalar    *x,*b;
473987828ca2SBarry Smith   PetscScalar    s1,x1;
474015091d37SBarry Smith   MatScalar      *v;
4741690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
474215091d37SBarry Smith 
474315091d37SBarry Smith   PetscFunctionBegin;
47441ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
47451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
474615091d37SBarry Smith 
474715091d37SBarry Smith   /* forward solve the lower triangular */
474815091d37SBarry Smith   idx    = 0;
474915091d37SBarry Smith   x[0]   = b[0];
475015091d37SBarry Smith   for (i=1; i<n; i++) {
475115091d37SBarry Smith     v     =  aa      + ai[i];
475215091d37SBarry Smith     vi    =  aj      + ai[i];
475315091d37SBarry Smith     nz    =  diag[i] - ai[i];
475415091d37SBarry Smith     idx   +=  1;
4755f1af5d2fSBarry Smith     s1  =  b[idx];
475615091d37SBarry Smith     while (nz--) {
475715091d37SBarry Smith       jdx   = *vi++;
475815091d37SBarry Smith       x1    = x[jdx];
4759f1af5d2fSBarry Smith       s1 -= v[0]*x1;
476015091d37SBarry Smith       v    += 1;
476115091d37SBarry Smith     }
4762f1af5d2fSBarry Smith     x[idx]   = s1;
476315091d37SBarry Smith   }
476415091d37SBarry Smith   /* backward solve the upper triangular */
476515091d37SBarry Smith   for (i=n-1; i>=0; i--){
476615091d37SBarry Smith     v    = aa + diag[i] + 1;
476715091d37SBarry Smith     vi   = aj + diag[i] + 1;
476815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
476915091d37SBarry Smith     idt  = i;
4770f1af5d2fSBarry Smith     s1 = x[idt];
477115091d37SBarry Smith     while (nz--) {
477215091d37SBarry Smith       idx   = *vi++;
477315091d37SBarry Smith       x1    = x[idx];
4774f1af5d2fSBarry Smith       s1 -= v[0]*x1;
477515091d37SBarry Smith       v    += 1;
477615091d37SBarry Smith     }
477715091d37SBarry Smith     v        = aa +  diag[i];
4778f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
477915091d37SBarry Smith   }
47801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
47811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4782dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
478315091d37SBarry Smith   PetscFunctionReturn(0);
478415091d37SBarry Smith }
47854e2b4712SSatish Balay 
47864e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/* Forward declarations of SeqBAIJ helpers; MatDuplicateNoCreate_SeqBAIJ() is called by the ILU(0) symbolic routine below. */
478716a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
47886bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
47896bce7ff8SHong Zhang 
47906bce7ff8SHong Zhang #undef __FUNCT__
47916bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
47926bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
47936bce7ff8SHong Zhang {
47946bce7ff8SHong Zhang   Mat            C=B;
47956bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
47966bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
47976bce7ff8SHong Zhang   PetscErrorCode ierr;
47986bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
47996bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
48006bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4801b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4802914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4803914a18a2SHong Zhang   MatScalar      *v_work;
48046bce7ff8SHong Zhang 
48056bce7ff8SHong Zhang   PetscFunctionBegin;
48066bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
48076bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4808914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4809914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
48106bce7ff8SHong Zhang   ics  = ic;
48116bce7ff8SHong Zhang 
4812914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
4813914a18a2SHong Zhang   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
4814b588c5a2SHong Zhang   mwork    = v_work + bs;
4815b588c5a2SHong Zhang   v_pivots = (PetscInt*)(mwork + bs2);
4816914a18a2SHong Zhang 
48176bce7ff8SHong Zhang   for (i=0; i<n; i++){
48186bce7ff8SHong Zhang     /* zero rtmp */
48196bce7ff8SHong Zhang     /* L part */
48206bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
48216bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4822914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4823914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4824914a18a2SHong Zhang     }
48256bce7ff8SHong Zhang 
48266bce7ff8SHong Zhang     /* U part */
48276bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
48286bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
4829914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4830914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4831914a18a2SHong Zhang     }
48326bce7ff8SHong Zhang 
48336bce7ff8SHong Zhang     /* load in initial (unfactored row) */
48346bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
48356bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
4836914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
48376bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4838914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
48396bce7ff8SHong Zhang     }
48406bce7ff8SHong Zhang 
48416bce7ff8SHong Zhang     /* elimination */
48426bce7ff8SHong Zhang     bjtmp = bj + bi[i];
48436bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
4844b1646270SShri Abhyankar     for(k=0;k < nzL;k++) {
4845b1646270SShri Abhyankar       row = bjtmp[k];
4846914a18a2SHong Zhang       pc = rtmp + bs2*row;
4847914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
4848914a18a2SHong Zhang       if (flg) {
4849914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
4850b588c5a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
48516bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
4852914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
48536bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
4854914a18a2SHong Zhang         for (j=0; j<nz; j++) {
4855914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
4856914a18a2SHong Zhang         }
4857b588c5a2SHong Zhang         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
48586bce7ff8SHong Zhang       }
48596bce7ff8SHong Zhang     }
48606bce7ff8SHong Zhang 
48616bce7ff8SHong Zhang     /* finished row so stick it into b->a */
48626bce7ff8SHong Zhang     /* L part */
4863914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
48646bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
48656bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
48666bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
4867914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
48686bce7ff8SHong Zhang     }
48696bce7ff8SHong Zhang 
48706bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
4871914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
48726bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
4873914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
4874914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4875914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
48766bce7ff8SHong Zhang 
48776bce7ff8SHong Zhang     /* U part */
4878914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
48796bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
48806bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
4881914a18a2SHong Zhang     for (j=0; j<nz; j++){
4882914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4883914a18a2SHong Zhang     }
48846bce7ff8SHong Zhang   }
48856bce7ff8SHong Zhang 
48866bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
48876bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
48886bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
48896bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
489027019359SHong Zhang 
48916bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
4892914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
48936bce7ff8SHong Zhang   PetscFunctionReturn(0);
48946bce7ff8SHong Zhang }
48956bce7ff8SHong Zhang 
48961a83e813SShri Abhyankar #undef __FUNCT__
48971a83e813SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2"
48981a83e813SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct_v2(Mat B,Mat A,const MatFactorInfo *info)
48991a83e813SShri Abhyankar {
49001a83e813SShri Abhyankar   Mat            C=B;
49011a83e813SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
49021a83e813SShri Abhyankar   IS             isrow = b->row,isicol = b->icol;
49031a83e813SShri Abhyankar   PetscErrorCode ierr;
49041a83e813SShri Abhyankar   const PetscInt *r,*ic,*ics;
49051a83e813SShri Abhyankar   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
49061a83e813SShri Abhyankar   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
49071a83e813SShri Abhyankar   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
49081a83e813SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
49091a83e813SShri Abhyankar   MatScalar      *v_work;
49101a83e813SShri Abhyankar 
49111a83e813SShri Abhyankar   PetscFunctionBegin;
49121a83e813SShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
49131a83e813SShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
49141a83e813SShri Abhyankar   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
49151a83e813SShri Abhyankar   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
49161a83e813SShri Abhyankar   ics  = ic;
49171a83e813SShri Abhyankar 
49181a83e813SShri Abhyankar   /* generate work space needed by dense LU factorization */
49191a83e813SShri Abhyankar   ierr     = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
49201a83e813SShri Abhyankar   mwork    = v_work + bs;
49211a83e813SShri Abhyankar   v_pivots = (PetscInt*)(mwork + bs2);
49221a83e813SShri Abhyankar 
49231a83e813SShri Abhyankar   for (i=0; i<n; i++){
49241a83e813SShri Abhyankar     /* zero rtmp */
49251a83e813SShri Abhyankar     /* L part */
49261a83e813SShri Abhyankar     nz    = bi[i+1] - bi[i];
49271a83e813SShri Abhyankar     bjtmp = bj + bi[i];
49281a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
49291a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49301a83e813SShri Abhyankar     }
49311a83e813SShri Abhyankar 
49321a83e813SShri Abhyankar     /* U part */
49331a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
49341a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
49351a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
49361a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49371a83e813SShri Abhyankar     }
49381a83e813SShri Abhyankar 
49391a83e813SShri Abhyankar     /* load in initial (unfactored row) */
49401a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
49411a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
49421a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
49431a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
49441a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
49451a83e813SShri Abhyankar     }
49461a83e813SShri Abhyankar 
49471a83e813SShri Abhyankar     /* elimination */
49481a83e813SShri Abhyankar     bjtmp = bj + bi[i];
49491a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
49501a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
49511a83e813SShri Abhyankar       row = bjtmp[k];
49521a83e813SShri Abhyankar       pc = rtmp + bs2*row;
49531a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
49541a83e813SShri Abhyankar       if (flg) {
49551a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
49561a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
49571a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
49581a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
49591a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
49601a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
49611a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
49621a83e813SShri Abhyankar         }
49631a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
49641a83e813SShri Abhyankar       }
49651a83e813SShri Abhyankar     }
49661a83e813SShri Abhyankar 
49671a83e813SShri Abhyankar     /* finished row so stick it into b->a */
49681a83e813SShri Abhyankar     /* L part */
49691a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
49701a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
49711a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
49721a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
49731a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49741a83e813SShri Abhyankar     }
49751a83e813SShri Abhyankar 
49761a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
49771a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
49781a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
49791a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
49801a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49811a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
49821a83e813SShri Abhyankar 
49831a83e813SShri Abhyankar     /* U part */
49841a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
49851a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
49861a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
49871a83e813SShri Abhyankar     for (j=0; j<nz; j++){
49881a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
49891a83e813SShri Abhyankar     }
49901a83e813SShri Abhyankar   }
49911a83e813SShri Abhyankar 
49921a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
49931a83e813SShri Abhyankar   ierr = PetscFree(v_work);CHKERRQ(ierr);
49941a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
49951a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
49961a83e813SShri Abhyankar 
49971a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
49981a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
49991a83e813SShri Abhyankar   PetscFunctionReturn(0);
50001a83e813SShri Abhyankar }
50011a83e813SShri Abhyankar 
50026bce7ff8SHong Zhang /*
50036bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
500416a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
500516a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
50066bce7ff8SHong Zhang */
50076bce7ff8SHong Zhang #undef __FUNCT__
50086bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
50096bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
50106bce7ff8SHong Zhang {
50116bce7ff8SHong Zhang 
50126bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
50136bce7ff8SHong Zhang   PetscErrorCode     ierr;
501416a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
501516a2bf60SHong Zhang   PetscInt           i,j,nz,*bi,*bj,*bdiag;
50166bce7ff8SHong Zhang 
50176bce7ff8SHong Zhang   PetscFunctionBegin;
501816a2bf60SHong Zhang   /* printf("MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct...\n"); */
501916a2bf60SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
50206bce7ff8SHong Zhang   b    = (Mat_SeqBAIJ*)(fact)->data;
502116a2bf60SHong Zhang 
502216a2bf60SHong Zhang   /* allocate matrix arrays for new data structure */
502316a2bf60SHong Zhang   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,2*n+2,PetscInt,&b->i);CHKERRQ(ierr);
502416a2bf60SHong Zhang   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(2*n+2)*sizeof(PetscInt));CHKERRQ(ierr);
502516a2bf60SHong Zhang   b->singlemalloc = PETSC_TRUE;
502616a2bf60SHong Zhang   if (!b->diag){
502716a2bf60SHong Zhang     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
502816a2bf60SHong Zhang   }
5029914a18a2SHong Zhang   bdiag = b->diag;
50306bce7ff8SHong Zhang 
503116a2bf60SHong Zhang   if (n > 0) {
503216a2bf60SHong Zhang     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
50336bce7ff8SHong Zhang   }
50346bce7ff8SHong Zhang 
50356bce7ff8SHong Zhang   /* set bi and bj with new data structure */
50366bce7ff8SHong Zhang   bi = b->i;
50376bce7ff8SHong Zhang   bj = b->j;
50386bce7ff8SHong Zhang 
50396bce7ff8SHong Zhang   /* L part */
50406bce7ff8SHong Zhang   bi[0] = 0;
504116a2bf60SHong Zhang   for (i=0; i<n; i++){
50426bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
5043914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
50446bce7ff8SHong Zhang     aj = a->j + ai[i];
50456bce7ff8SHong Zhang     for (j=0; j<nz; j++){
50466bce7ff8SHong Zhang       *bj = aj[j]; bj++;
50476bce7ff8SHong Zhang     }
50486bce7ff8SHong Zhang   }
50496bce7ff8SHong Zhang 
50506bce7ff8SHong Zhang   /* U part */
505116a2bf60SHong Zhang   bi[n+1] = bi[n];
505216a2bf60SHong Zhang   for (i=n-1; i>=0; i--){
50536bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
505416a2bf60SHong Zhang     bi[2*n-i+1] = bi[2*n-i] + nz + 1;
50556bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
50566bce7ff8SHong Zhang     for (j=0; j<nz; j++){
50576bce7ff8SHong Zhang       *bj = aj[j]; bj++;
50586bce7ff8SHong Zhang     }
50596bce7ff8SHong Zhang     /* diag[i] */
50606bce7ff8SHong Zhang     *bj = i; bj++;
506116a2bf60SHong Zhang     bdiag[i] = bi[2*n-i+1]-1;
50626bce7ff8SHong Zhang   }
50636bce7ff8SHong Zhang   PetscFunctionReturn(0);
50646bce7ff8SHong Zhang }
50656bce7ff8SHong Zhang 
506616a2bf60SHong Zhang #undef __FUNCT__
506716a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
506816a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
506916a2bf60SHong Zhang {
507016a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
507116a2bf60SHong Zhang   IS                 isicol;
507216a2bf60SHong Zhang   PetscErrorCode     ierr;
507316a2bf60SHong Zhang   const PetscInt     *r,*ic;
50747fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
507516a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
507616a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
507716a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
50787fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
507916a2bf60SHong Zhang   PetscReal          f;
508016a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
508116a2bf60SHong Zhang   PetscBT            lnkbt;
508216a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
508316a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
508416a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
508516a2bf60SHong Zhang   PetscTruth         missing;
50867fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
508716a2bf60SHong Zhang 
508816a2bf60SHong Zhang   PetscFunctionBegin;
508916a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
509016a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
509116a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
509216a2bf60SHong Zhang 
509316a2bf60SHong Zhang   f             = info->fill;
509416a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
509516a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
509616a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
509716a2bf60SHong Zhang 
509816a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
509916a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
51007fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
510116a2bf60SHong Zhang 
51027fa3a6a0SHong Zhang   if (!levels && both_identity) {
510316a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
510416a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
510516a2bf60SHong Zhang     (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
51067fa3a6a0SHong Zhang     /* set MatSolve routines */
51077fa3a6a0SHong Zhang     switch (bs){
51087fa3a6a0SHong Zhang     case 2:
51097fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
51107fa3a6a0SHong Zhang       break;
51117fa3a6a0SHong Zhang     case 3:
51127fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
51137fa3a6a0SHong Zhang       break;
51147fa3a6a0SHong Zhang     case 4:
51157fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
51167fa3a6a0SHong Zhang       break;
51177fa3a6a0SHong Zhang     case 5:
51187fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
51197fa3a6a0SHong Zhang       break;
51207fa3a6a0SHong Zhang     case 6:
51217fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
51227fa3a6a0SHong Zhang       break;
51237fa3a6a0SHong Zhang     case 7:
51247fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
51257fa3a6a0SHong Zhang       break;
51267fa3a6a0SHong Zhang     default:
51277fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
51287fa3a6a0SHong Zhang       break;
51297fa3a6a0SHong Zhang     }
513016a2bf60SHong Zhang 
513116a2bf60SHong Zhang     fact->factor = MAT_FACTOR_ILU;
513216a2bf60SHong Zhang     (fact)->info.factor_mallocs    = 0;
513316a2bf60SHong Zhang     (fact)->info.fill_ratio_given  = info->fill;
513416a2bf60SHong Zhang     (fact)->info.fill_ratio_needed = 1.0;
513516a2bf60SHong Zhang     b                = (Mat_SeqBAIJ*)(fact)->data;
513616a2bf60SHong Zhang     b->row           = isrow;
513716a2bf60SHong Zhang     b->col           = iscol;
513816a2bf60SHong Zhang     b->icol          = isicol;
513916a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
514016a2bf60SHong Zhang     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
514116a2bf60SHong Zhang     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5142b588c5a2SHong Zhang     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
514316a2bf60SHong Zhang     PetscFunctionReturn(0);
514416a2bf60SHong Zhang   }
514516a2bf60SHong Zhang 
514616a2bf60SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
514716a2bf60SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
514816a2bf60SHong Zhang 
514916a2bf60SHong Zhang   /* get new row pointers */
515016a2bf60SHong Zhang   ierr = PetscMalloc((2*n+2)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
515116a2bf60SHong Zhang   bi[0] = 0;
515216a2bf60SHong Zhang   /* bdiag is location of diagonal in factor */
515316a2bf60SHong Zhang   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
515416a2bf60SHong Zhang   bdiag[0]  = 0;
515516a2bf60SHong Zhang 
515616a2bf60SHong Zhang   ierr = PetscMalloc((2*n+1)*sizeof(PetscInt**),&bj_ptr);CHKERRQ(ierr);
515716a2bf60SHong Zhang   bjlvl_ptr = (PetscInt**)(bj_ptr + n);
515816a2bf60SHong Zhang 
515916a2bf60SHong Zhang   /* create a linked list for storing column indices of the active row */
516016a2bf60SHong Zhang   nlnk = n + 1;
516116a2bf60SHong Zhang   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
516216a2bf60SHong Zhang 
516316a2bf60SHong Zhang   /* initial FreeSpace size is f*(ai[n]+1) */
516416a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
516516a2bf60SHong Zhang   current_space = free_space;
516616a2bf60SHong Zhang   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
516716a2bf60SHong Zhang   current_space_lvl = free_space_lvl;
516816a2bf60SHong Zhang 
516916a2bf60SHong Zhang   for (i=0; i<n; i++) {
517016a2bf60SHong Zhang     nzi = 0;
517116a2bf60SHong Zhang     /* copy current row into linked list */
517216a2bf60SHong Zhang     nnz  = ai[r[i]+1] - ai[r[i]];
517316a2bf60SHong Zhang     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
517416a2bf60SHong Zhang     cols = aj + ai[r[i]];
517516a2bf60SHong Zhang     lnk[i] = -1; /* marker to indicate if diagonal exists */
517616a2bf60SHong Zhang     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
517716a2bf60SHong Zhang     nzi += nlnk;
517816a2bf60SHong Zhang 
517916a2bf60SHong Zhang     /* make sure diagonal entry is included */
518016a2bf60SHong Zhang     if (diagonal_fill && lnk[i] == -1) {
518116a2bf60SHong Zhang       fm = n;
518216a2bf60SHong Zhang       while (lnk[fm] < i) fm = lnk[fm];
518316a2bf60SHong Zhang       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
518416a2bf60SHong Zhang       lnk[fm]    = i;
518516a2bf60SHong Zhang       lnk_lvl[i] = 0;
518616a2bf60SHong Zhang       nzi++; dcount++;
518716a2bf60SHong Zhang     }
518816a2bf60SHong Zhang 
518916a2bf60SHong Zhang     /* add pivot rows into the active row */
519016a2bf60SHong Zhang     nzbd = 0;
519116a2bf60SHong Zhang     prow = lnk[n];
519216a2bf60SHong Zhang     while (prow < i) {
519316a2bf60SHong Zhang       nnz      = bdiag[prow];
519416a2bf60SHong Zhang       cols     = bj_ptr[prow] + nnz + 1;
519516a2bf60SHong Zhang       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
519616a2bf60SHong Zhang       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
519716a2bf60SHong Zhang       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
519816a2bf60SHong Zhang       nzi += nlnk;
519916a2bf60SHong Zhang       prow = lnk[prow];
520016a2bf60SHong Zhang       nzbd++;
520116a2bf60SHong Zhang     }
520216a2bf60SHong Zhang     bdiag[i] = nzbd;
520316a2bf60SHong Zhang     bi[i+1]  = bi[i] + nzi;
520416a2bf60SHong Zhang 
520516a2bf60SHong Zhang     /* if free space is not available, make more free space */
520616a2bf60SHong Zhang     if (current_space->local_remaining<nzi) {
520716a2bf60SHong Zhang       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
520816a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
520916a2bf60SHong Zhang       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
521016a2bf60SHong Zhang       reallocs++;
521116a2bf60SHong Zhang     }
521216a2bf60SHong Zhang 
521316a2bf60SHong Zhang     /* copy data into free_space and free_space_lvl, then initialize lnk */
521416a2bf60SHong Zhang     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
521516a2bf60SHong Zhang     bj_ptr[i]    = current_space->array;
521616a2bf60SHong Zhang     bjlvl_ptr[i] = current_space_lvl->array;
521716a2bf60SHong Zhang 
521816a2bf60SHong Zhang     /* make sure the active row i has diagonal entry */
521916a2bf60SHong Zhang     if (*(bj_ptr[i]+bdiag[i]) != i) {
522016a2bf60SHong Zhang       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
522116a2bf60SHong Zhang     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
522216a2bf60SHong Zhang     }
522316a2bf60SHong Zhang 
522416a2bf60SHong Zhang     current_space->array           += nzi;
522516a2bf60SHong Zhang     current_space->local_used      += nzi;
522616a2bf60SHong Zhang     current_space->local_remaining -= nzi;
522716a2bf60SHong Zhang     current_space_lvl->array           += nzi;
522816a2bf60SHong Zhang     current_space_lvl->local_used      += nzi;
522916a2bf60SHong Zhang     current_space_lvl->local_remaining -= nzi;
523016a2bf60SHong Zhang   }
523116a2bf60SHong Zhang 
523216a2bf60SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
523316a2bf60SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
523416a2bf60SHong Zhang 
523516a2bf60SHong Zhang   /* destroy list of free space and other temporary arrays */
523616a2bf60SHong Zhang   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
523716a2bf60SHong Zhang 
523816a2bf60SHong Zhang   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
5239783ef271SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
524016a2bf60SHong Zhang 
524116a2bf60SHong Zhang   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
524216a2bf60SHong Zhang   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
524316a2bf60SHong Zhang   ierr = PetscFree(bj_ptr);CHKERRQ(ierr);
524416a2bf60SHong Zhang 
524516a2bf60SHong Zhang #if defined(PETSC_USE_INFO)
524616a2bf60SHong Zhang   {
524716a2bf60SHong Zhang     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
524816a2bf60SHong Zhang     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
524916a2bf60SHong Zhang     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
525016a2bf60SHong Zhang     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
525116a2bf60SHong Zhang     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
525216a2bf60SHong Zhang     if (diagonal_fill) {
525316a2bf60SHong Zhang       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
525416a2bf60SHong Zhang     }
525516a2bf60SHong Zhang   }
525616a2bf60SHong Zhang #endif
525716a2bf60SHong Zhang 
525816a2bf60SHong Zhang   /* put together the new matrix */
525916a2bf60SHong Zhang   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
526016a2bf60SHong Zhang   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
526116a2bf60SHong Zhang   b = (Mat_SeqBAIJ*)(fact)->data;
526216a2bf60SHong Zhang   b->free_a       = PETSC_TRUE;
526316a2bf60SHong Zhang   b->free_ij      = PETSC_TRUE;
526416a2bf60SHong Zhang   b->singlemalloc = PETSC_FALSE;
52657fa3a6a0SHong Zhang   ierr = PetscMalloc( (bs2*bi[2*n+1] )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
526616a2bf60SHong Zhang   b->j          = bj;
526716a2bf60SHong Zhang   b->i          = bi;
526816a2bf60SHong Zhang   b->diag       = bdiag;
52697f53bb6cSHong Zhang   b->free_diag  = PETSC_TRUE;
527016a2bf60SHong Zhang   b->ilen       = 0;
527116a2bf60SHong Zhang   b->imax       = 0;
527216a2bf60SHong Zhang   b->row        = isrow;
527316a2bf60SHong Zhang   b->col        = iscol;
527416a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
527516a2bf60SHong Zhang   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
527616a2bf60SHong Zhang   b->icol       = isicol;
52777fa3a6a0SHong Zhang   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
527816a2bf60SHong Zhang   /* In b structure:  Free imax, ilen, old a, old j.
527916a2bf60SHong Zhang      Allocate bdiag, solve_work, new a, new j */
52807fa3a6a0SHong Zhang   ierr = PetscLogObjectMemory(fact,bi[2*n+1] * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
528116a2bf60SHong Zhang   b->maxnz = b->nz = bi[2*n+1] ;
528216a2bf60SHong Zhang   (fact)->info.factor_mallocs    = reallocs;
528316a2bf60SHong Zhang   (fact)->info.fill_ratio_given  = f;
528416a2bf60SHong Zhang   (fact)->info.fill_ratio_needed = ((PetscReal)bi[2*n+1])/((PetscReal)ai[n]);
528516a2bf60SHong Zhang   (fact)->ops->lufactornumeric   = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
52867fa3a6a0SHong Zhang   /* set MatSolve routines */
52877fa3a6a0SHong Zhang   if (both_identity){
52887fa3a6a0SHong Zhang     switch (bs){
52897fa3a6a0SHong Zhang     case 2:
52907fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
52917fa3a6a0SHong Zhang       break;
52927fa3a6a0SHong Zhang     case 3:
52937fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
52947fa3a6a0SHong Zhang       break;
52957fa3a6a0SHong Zhang     case 4:
52967fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
52977fa3a6a0SHong Zhang       break;
52987fa3a6a0SHong Zhang     case 5:
52997fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
53007fa3a6a0SHong Zhang       break;
53017fa3a6a0SHong Zhang     case 6:
53027fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
53037fa3a6a0SHong Zhang       break;
53047fa3a6a0SHong Zhang     case 7:
53057fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
53067fa3a6a0SHong Zhang       break;
53077fa3a6a0SHong Zhang     default:
53087fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
53097fa3a6a0SHong Zhang       break;
53107fa3a6a0SHong Zhang     }
53117fa3a6a0SHong Zhang   } else {
53127fa3a6a0SHong Zhang     switch (bs){
53137fa3a6a0SHong Zhang     case 2:
53147fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_2_newdatastruct;
53157fa3a6a0SHong Zhang       break;
53167fa3a6a0SHong Zhang     case 3:
53177fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_3_newdatastruct;
53187fa3a6a0SHong Zhang       break;
53197fa3a6a0SHong Zhang     case 4:
53207fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_4_newdatastruct;
53217fa3a6a0SHong Zhang       break;
53227fa3a6a0SHong Zhang     case 5:
53237fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_5_newdatastruct;
53247fa3a6a0SHong Zhang       break;
53257fa3a6a0SHong Zhang     case 6:
53267fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_6_newdatastruct;
53277fa3a6a0SHong Zhang       break;
53287fa3a6a0SHong Zhang     case 7:
53297fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_7_newdatastruct;
53307fa3a6a0SHong Zhang       break;
53317fa3a6a0SHong Zhang     default:
53327fa3a6a0SHong Zhang       fact->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
53337fa3a6a0SHong Zhang       break;
53347fa3a6a0SHong Zhang     }
53357fa3a6a0SHong Zhang   }
533616a2bf60SHong Zhang   PetscFunctionReturn(0);
533716a2bf60SHong Zhang }
533816a2bf60SHong Zhang 
53394e2b4712SSatish Balay /*
53404e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
53414e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
53424e2b4712SSatish Balay    Not a good example of code reuse.
53434e2b4712SSatish Balay */
53444a2ae208SSatish Balay #undef __FUNCT__
53454a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
53460481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
53474e2b4712SSatish Balay {
53484e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
53494e2b4712SSatish Balay   IS             isicol;
53506849ba73SBarry Smith   PetscErrorCode ierr;
53515d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
53525d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5353a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5354d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
535541df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5356329f5518SBarry Smith   PetscReal      f;
535716a2bf60SHong Zhang   PetscTruth     newdatastruct=PETSC_FALSE;
53584e2b4712SSatish Balay 
53594e2b4712SSatish Balay   PetscFunctionBegin;
536016a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
536116a2bf60SHong Zhang   if (newdatastruct){
536216a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
536316a2bf60SHong Zhang     PetscFunctionReturn(0);
536416a2bf60SHong Zhang   }
536516a2bf60SHong Zhang 
53666bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
53676bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
53686bce7ff8SHong Zhang 
5369435faa5fSBarry Smith   f             = info->fill;
5370690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5371690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
53724c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
537316a2bf60SHong Zhang 
5374667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5375667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
53767d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5377309c388cSBarry Smith 
537841df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
537916a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
53806bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
53816bce7ff8SHong Zhang 
5382719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5383719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
5384bb3d539aSBarry Smith     b->row       = isrow;
5385bb3d539aSBarry Smith     b->col       = iscol;
5386bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5387bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5388bb3d539aSBarry Smith     b->icol      = isicol;
5389bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5390b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
53916bce7ff8SHong Zhang     PetscFunctionReturn(0);
53926bce7ff8SHong Zhang   }
53936bce7ff8SHong Zhang 
53946bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
53954e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
53964e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
53974e2b4712SSatish Balay 
53984e2b4712SSatish Balay     /* get new row pointers */
5399690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
54004e2b4712SSatish Balay     ainew[0] = 0;
54014e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5402690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5403690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
54044e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5405690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
54064e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5407690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
54084e2b4712SSatish Balay     /* im is level for each filled value */
5409690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
54104e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5411690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
54124e2b4712SSatish Balay     dloc[0]  = 0;
54134e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5414435faa5fSBarry Smith 
5415435faa5fSBarry Smith       /* copy prow into linked list */
54164e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
54173b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
54184e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
54194e2b4712SSatish Balay       fill[n]    = n;
5420435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
54214e2b4712SSatish Balay       while (nz--) {
54224e2b4712SSatish Balay 	fm  = n;
54234e2b4712SSatish Balay 	idx = ic[*xi++];
54244e2b4712SSatish Balay 	do {
54254e2b4712SSatish Balay 	  m  = fm;
54264e2b4712SSatish Balay 	  fm = fill[m];
54274e2b4712SSatish Balay 	} while (fm < idx);
54284e2b4712SSatish Balay 	fill[m]   = idx;
54294e2b4712SSatish Balay 	fill[idx] = fm;
54304e2b4712SSatish Balay 	im[idx]   = 0;
54314e2b4712SSatish Balay       }
5432435faa5fSBarry Smith 
5433435faa5fSBarry Smith       /* make sure diagonal entry is included */
5434435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5435435faa5fSBarry Smith 	fm = n;
5436435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5437435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5438435faa5fSBarry Smith 	fill[fm]   = prow;
5439435faa5fSBarry Smith 	im[prow]   = 0;
5440435faa5fSBarry Smith 	nzf++;
5441335d9088SBarry Smith 	dcount++;
5442435faa5fSBarry Smith       }
5443435faa5fSBarry Smith 
54444e2b4712SSatish Balay       nzi = 0;
54454e2b4712SSatish Balay       row = fill[n];
54464e2b4712SSatish Balay       while (row < prow) {
54474e2b4712SSatish Balay 	incrlev = im[row] + 1;
54484e2b4712SSatish Balay 	nz      = dloc[row];
5449435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
54504e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
54514e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
54524e2b4712SSatish Balay 	fm      = row;
54534e2b4712SSatish Balay 	while (nnz-- > 0) {
54544e2b4712SSatish Balay 	  idx = *xi++;
54554e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
54564e2b4712SSatish Balay 	    flev++;
54574e2b4712SSatish Balay 	    continue;
54584e2b4712SSatish Balay 	  }
54594e2b4712SSatish Balay 	  do {
54604e2b4712SSatish Balay 	    m  = fm;
54614e2b4712SSatish Balay 	    fm = fill[m];
54624e2b4712SSatish Balay 	  } while (fm < idx);
54634e2b4712SSatish Balay 	  if (fm != idx) {
54644e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
54654e2b4712SSatish Balay 	    fill[m]   = idx;
54664e2b4712SSatish Balay 	    fill[idx] = fm;
54674e2b4712SSatish Balay 	    fm        = idx;
54684e2b4712SSatish Balay 	    nzf++;
5469ecf371e4SBarry Smith 	  } else {
54704e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
54714e2b4712SSatish Balay 	  }
54724e2b4712SSatish Balay 	  flev++;
54734e2b4712SSatish Balay 	}
54744e2b4712SSatish Balay 	row = fill[row];
54754e2b4712SSatish Balay 	nzi++;
54764e2b4712SSatish Balay       }
54774e2b4712SSatish Balay       /* copy new filled row into permanent storage */
54784e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
54794e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5480ecf371e4SBarry Smith 
5481ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5482ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5483ecf371e4SBarry Smith 	/* just double the memory each time */
5484690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5485ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
54864e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
54874e2b4712SSatish Balay 	jmax += maxadd;
5488ecf371e4SBarry Smith 
5489ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
54905d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
54915d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5492606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
54935d0c19d7SBarry Smith 	ajnew = xitmp;
54945d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
54955d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5496606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
54975d0c19d7SBarry Smith 	ajfill = xitmp;
5498eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
54994e2b4712SSatish Balay       }
55005d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
55014e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
55024e2b4712SSatish Balay       dloc[prow]  = nzi;
55034e2b4712SSatish Balay       fm          = fill[n];
55044e2b4712SSatish Balay       while (nzf--) {
55055d0c19d7SBarry Smith 	*xitmp++ = fm;
55064e2b4712SSatish Balay 	*flev++ = im[fm];
55074e2b4712SSatish Balay 	fm      = fill[fm];
55084e2b4712SSatish Balay       }
5509435faa5fSBarry Smith       /* make sure row has diagonal entry */
5510435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
551177431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
55122401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5513435faa5fSBarry Smith       }
55144e2b4712SSatish Balay     }
5515606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
55164e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
55174e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5518606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5519606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
55204e2b4712SSatish Balay 
55216cf91177SBarry Smith #if defined(PETSC_USE_INFO)
55224e2b4712SSatish Balay     {
5523329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5524ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5525ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5526ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5527ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5528335d9088SBarry Smith       if (diagonal_fill) {
5529ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5530335d9088SBarry Smith       }
55314e2b4712SSatish Balay     }
553263ba0a88SBarry Smith #endif
55334e2b4712SSatish Balay 
55344e2b4712SSatish Balay     /* put together the new matrix */
5535719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5536719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5537719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
5538e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5539e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
55407c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5541a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
55424e2b4712SSatish Balay     b->j          = ajnew;
55434e2b4712SSatish Balay     b->i          = ainew;
55444e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
55454e2b4712SSatish Balay     b->diag       = dloc;
55467f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
55474e2b4712SSatish Balay     b->ilen       = 0;
55484e2b4712SSatish Balay     b->imax       = 0;
55494e2b4712SSatish Balay     b->row        = isrow;
55504e2b4712SSatish Balay     b->col        = iscol;
5551bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5552c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5553c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5554e51c0b9cSSatish Balay     b->icol       = isicol;
555587828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
55564e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
55574e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5558719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
55594e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
55604e2b4712SSatish Balay 
5561719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
5562719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
5563719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
55646bce7ff8SHong Zhang 
556541df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
55668661488fSKris Buschelman   PetscFunctionReturn(0);
55678661488fSKris Buschelman }
55688661488fSKris Buschelman 
5569732ee342SKris Buschelman #undef __FUNCT__
55707e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5571dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
55727e7071cdSKris Buschelman {
557312272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
557412272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
55755a9542e3SKris Buschelman   PetscFunctionBegin;
55767cf1b8d3SKris Buschelman   /* Undo Column scaling */
55777cf1b8d3SKris Buschelman /*    while (nz--) { */
55787cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
55797cf1b8d3SKris Buschelman /*    } */
5580c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5581c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
55827cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
55837cf1b8d3SKris Buschelman }
55847cf1b8d3SKris Buschelman 
55857cf1b8d3SKris Buschelman #undef __FUNCT__
55867cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5587dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
55887cf1b8d3SKris Buschelman {
55897cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5590b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
55912aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
55925a9542e3SKris Buschelman   PetscFunctionBegin;
55930b9da03eSKris Buschelman   /* Is this really necessary? */
559420235379SKris Buschelman   while (nz--) {
55950b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
55967e7071cdSKris Buschelman   }
5597c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
55987e7071cdSKris Buschelman   PetscFunctionReturn(0);
55997e7071cdSKris Buschelman }
56007e7071cdSKris Buschelman 
5601732ee342SKris Buschelman 
5602