xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6929473c87c3d067b830e278ca4437b6bd644cf7)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
/*
    Factorization code for BAIJ format.
*/
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
118*6929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
119*6929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
120*6929473cSShri Abhyankar {
121*6929473cSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
122*6929473cSShri Abhyankar   PetscErrorCode ierr;
123*6929473cSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
124*6929473cSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
125*6929473cSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
126*6929473cSShri Abhyankar   MatScalar      *aa=a->a,*v;
127*6929473cSShri Abhyankar   PetscScalar    s1,s2,x1,x2;
128*6929473cSShri Abhyankar   PetscScalar    *x,*b;
129*6929473cSShri Abhyankar 
130*6929473cSShri Abhyankar   PetscFunctionBegin;
131*6929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132*6929473cSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
133*6929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134*6929473cSShri Abhyankar 
135*6929473cSShri Abhyankar   /* forward solve the U^T */
136*6929473cSShri Abhyankar   idx = 0;
137*6929473cSShri Abhyankar   for (i=0; i<n; i++) {
138*6929473cSShri Abhyankar     v     = aa + bs2*diag[i];
139*6929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
140*6929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
141*6929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
142*6929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
143*6929473cSShri Abhyankar     v -= bs2;
144*6929473cSShri Abhyankar 
145*6929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
146*6929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
147*6929473cSShri Abhyankar     for(j=0;j>-nz;j--){
148*6929473cSShri Abhyankar       oidx = bs*vi[j];
149*6929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
150*6929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
151*6929473cSShri Abhyankar       v  -= bs2;
152*6929473cSShri Abhyankar     }
153*6929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
154*6929473cSShri Abhyankar     idx += bs;
155*6929473cSShri Abhyankar   }
156*6929473cSShri Abhyankar   /* backward solve the L^T */
157*6929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
158*6929473cSShri Abhyankar     v    = aa + bs2*ai[i];
159*6929473cSShri Abhyankar     vi   = aj + ai[i];
160*6929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
161*6929473cSShri Abhyankar     idt  = bs*i;
162*6929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
163*6929473cSShri Abhyankar     for(j=0;j<nz;j++){
164*6929473cSShri Abhyankar       idx   = bs*vi[j];
165*6929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
166*6929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
167*6929473cSShri Abhyankar       v += bs2;
168*6929473cSShri Abhyankar     }
169*6929473cSShri Abhyankar   }
170*6929473cSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
171*6929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172*6929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
173*6929473cSShri Abhyankar   PetscFunctionReturn(0);
174*6929473cSShri Abhyankar }
175*6929473cSShri Abhyankar 
176*6929473cSShri Abhyankar #undef __FUNCT__
1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
179f1af5d2fSBarry Smith {
180f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181dfbe8321SBarry Smith   PetscErrorCode ierr;
182690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
184f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18587828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
18687828ca2SBarry Smith   PetscScalar    *x,*b;
187f1af5d2fSBarry Smith 
188f1af5d2fSBarry Smith   PetscFunctionBegin;
189ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith   /* forward solve the U^T */
194f1af5d2fSBarry Smith   idx = 0;
195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
196f1af5d2fSBarry Smith 
197f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
199ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203f1af5d2fSBarry Smith     v += 9;
204f1af5d2fSBarry Smith 
205f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
206f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
207f1af5d2fSBarry Smith     while (nz--) {
208f1af5d2fSBarry Smith       oidx = 3*(*vi++);
209f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212f1af5d2fSBarry Smith       v  += 9;
213f1af5d2fSBarry Smith     }
214f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215f1af5d2fSBarry Smith     idx += 3;
216f1af5d2fSBarry Smith   }
217f1af5d2fSBarry Smith   /* backward solve the L^T */
218f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
219f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
220f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
221f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
222f1af5d2fSBarry Smith     idt  = 3*i;
223f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224f1af5d2fSBarry Smith     while (nz--) {
225f1af5d2fSBarry Smith       idx   = 3*(*vi--);
226f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229f1af5d2fSBarry Smith       v -= 9;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
2321ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2394a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
240dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
241f1af5d2fSBarry Smith {
242f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243dfbe8321SBarry Smith   PetscErrorCode ierr;
244690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
245690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
246f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
24787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
24887828ca2SBarry Smith   PetscScalar    *x,*b;
249f1af5d2fSBarry Smith 
250f1af5d2fSBarry Smith   PetscFunctionBegin;
251ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2521ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
254f1af5d2fSBarry Smith 
255f1af5d2fSBarry Smith   /* forward solve the U^T */
256f1af5d2fSBarry Smith   idx = 0;
257f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
258f1af5d2fSBarry Smith 
259f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
260f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
261ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
262f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
263f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
264f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
265f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
266f1af5d2fSBarry Smith     v += 16;
267f1af5d2fSBarry Smith 
268f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
269f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
270f1af5d2fSBarry Smith     while (nz--) {
271f1af5d2fSBarry Smith       oidx = 4*(*vi++);
272f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
273f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
274f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
275f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
276f1af5d2fSBarry Smith       v  += 16;
277f1af5d2fSBarry Smith     }
278f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
279f1af5d2fSBarry Smith     idx += 4;
280f1af5d2fSBarry Smith   }
281f1af5d2fSBarry Smith   /* backward solve the L^T */
282f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
283f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
284f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
285f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
286f1af5d2fSBarry Smith     idt  = 4*i;
287f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
288f1af5d2fSBarry Smith     while (nz--) {
289f1af5d2fSBarry Smith       idx   = 4*(*vi--);
290f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
291f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
292f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
293f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
294f1af5d2fSBarry Smith       v -= 16;
295f1af5d2fSBarry Smith     }
296f1af5d2fSBarry Smith   }
2971ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
299dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
300f1af5d2fSBarry Smith   PetscFunctionReturn(0);
301f1af5d2fSBarry Smith }
302f1af5d2fSBarry Smith 
3034a2ae208SSatish Balay #undef __FUNCT__
3044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
305dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
306f1af5d2fSBarry Smith {
307f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
308dfbe8321SBarry Smith   PetscErrorCode ierr;
309690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
310690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
311f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
31387828ca2SBarry Smith   PetscScalar    *x,*b;
314f1af5d2fSBarry Smith 
315f1af5d2fSBarry Smith   PetscFunctionBegin;
316ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
319f1af5d2fSBarry Smith 
320f1af5d2fSBarry Smith   /* forward solve the U^T */
321f1af5d2fSBarry Smith   idx = 0;
322f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
323f1af5d2fSBarry Smith 
324f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
325f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
326ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
327f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
328f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
329f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
330f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
331f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
332f1af5d2fSBarry Smith     v += 25;
333f1af5d2fSBarry Smith 
334f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
335f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
336f1af5d2fSBarry Smith     while (nz--) {
337f1af5d2fSBarry Smith       oidx = 5*(*vi++);
338f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
339f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
340f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
341f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
342f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
343f1af5d2fSBarry Smith       v  += 25;
344f1af5d2fSBarry Smith     }
345f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
346f1af5d2fSBarry Smith     idx += 5;
347f1af5d2fSBarry Smith   }
348f1af5d2fSBarry Smith   /* backward solve the L^T */
349f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
350f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
351f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
352f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
353f1af5d2fSBarry Smith     idt  = 5*i;
354f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
355f1af5d2fSBarry Smith     while (nz--) {
356f1af5d2fSBarry Smith       idx   = 5*(*vi--);
357f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
358f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
359f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
360f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
361f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
362f1af5d2fSBarry Smith       v -= 25;
363f1af5d2fSBarry Smith     }
364f1af5d2fSBarry Smith   }
3651ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3661ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
367dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
368f1af5d2fSBarry Smith   PetscFunctionReturn(0);
369f1af5d2fSBarry Smith }
370f1af5d2fSBarry Smith 
3714a2ae208SSatish Balay #undef __FUNCT__
3724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
373dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
374f1af5d2fSBarry Smith {
375f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
376dfbe8321SBarry Smith   PetscErrorCode ierr;
377690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
378690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
379f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
38087828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
38187828ca2SBarry Smith   PetscScalar    *x,*b;
382f1af5d2fSBarry Smith 
383f1af5d2fSBarry Smith   PetscFunctionBegin;
384ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3851ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
387f1af5d2fSBarry Smith 
388f1af5d2fSBarry Smith   /* forward solve the U^T */
389f1af5d2fSBarry Smith   idx = 0;
390f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
391f1af5d2fSBarry Smith 
392f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
393f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
394ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
395ef66eb69SBarry Smith     x6    = x[5+idx];
396f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
397f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
398f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
399f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
400f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
401f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
402f1af5d2fSBarry Smith     v += 36;
403f1af5d2fSBarry Smith 
404f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
405f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
406f1af5d2fSBarry Smith     while (nz--) {
407f1af5d2fSBarry Smith       oidx = 6*(*vi++);
408f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
409f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
410f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
411f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
412f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
413f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
414f1af5d2fSBarry Smith       v  += 36;
415f1af5d2fSBarry Smith     }
416f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
417f1af5d2fSBarry Smith     x[5+idx] = s6;
418f1af5d2fSBarry Smith     idx += 6;
419f1af5d2fSBarry Smith   }
420f1af5d2fSBarry Smith   /* backward solve the L^T */
421f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
422f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
423f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
424f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
425f1af5d2fSBarry Smith     idt  = 6*i;
426f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
427f1af5d2fSBarry Smith     s6 = x[5+idt];
428f1af5d2fSBarry Smith     while (nz--) {
429f1af5d2fSBarry Smith       idx   = 6*(*vi--);
430f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
431f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
432f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
433f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
434f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
435f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
436f1af5d2fSBarry Smith       v -= 36;
437f1af5d2fSBarry Smith     }
438f1af5d2fSBarry Smith   }
4391ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
441dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
442f1af5d2fSBarry Smith   PetscFunctionReturn(0);
443f1af5d2fSBarry Smith }
444f1af5d2fSBarry Smith 
4454a2ae208SSatish Balay #undef __FUNCT__
4464a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
447dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
448f1af5d2fSBarry Smith {
449f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
450dfbe8321SBarry Smith   PetscErrorCode ierr;
451690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
452690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
453f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
45487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
45587828ca2SBarry Smith   PetscScalar    *x,*b;
456f1af5d2fSBarry Smith 
457f1af5d2fSBarry Smith   PetscFunctionBegin;
458ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4591ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4601ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
461f1af5d2fSBarry Smith 
462f1af5d2fSBarry Smith   /* forward solve the U^T */
463f1af5d2fSBarry Smith   idx = 0;
464f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
465f1af5d2fSBarry Smith 
466f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
467f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
468ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
469ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
470f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
471f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
472f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
473f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
474f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
475f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
476f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
477f1af5d2fSBarry Smith     v += 49;
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
480f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
481f1af5d2fSBarry Smith     while (nz--) {
482f1af5d2fSBarry Smith       oidx = 7*(*vi++);
483f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
484f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
485f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
486f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
487f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
488f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
489f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
490f1af5d2fSBarry Smith       v  += 49;
491f1af5d2fSBarry Smith     }
492f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
493f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
494f1af5d2fSBarry Smith     idx += 7;
495f1af5d2fSBarry Smith   }
496f1af5d2fSBarry Smith   /* backward solve the L^T */
497f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
498f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
499f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
500f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
501f1af5d2fSBarry Smith     idt  = 7*i;
502f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
503f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
504f1af5d2fSBarry Smith     while (nz--) {
505f1af5d2fSBarry Smith       idx   = 7*(*vi--);
506f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
507f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
508f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
509f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
510f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
511f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
512f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
513f1af5d2fSBarry Smith       v -= 49;
514f1af5d2fSBarry Smith     }
515f1af5d2fSBarry Smith   }
5161ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
518dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
519f1af5d2fSBarry Smith   PetscFunctionReturn(0);
520f1af5d2fSBarry Smith }
521f1af5d2fSBarry Smith 
522f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
5234a2ae208SSatish Balay #undef __FUNCT__
5244a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
525dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
526f1af5d2fSBarry Smith {
527f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
528f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5296849ba73SBarry Smith   PetscErrorCode ierr;
5305d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5315d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
532690b6cddSBarry Smith   PetscInt       *diag = a->diag;
533f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53487828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
535f1af5d2fSBarry Smith 
536f1af5d2fSBarry Smith   PetscFunctionBegin;
5371ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5381ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
539f1af5d2fSBarry Smith   t  = a->solve_work;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
542f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
543f1af5d2fSBarry Smith 
544f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
545f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
546f1af5d2fSBarry Smith     t[i] = b[c[i]];
547f1af5d2fSBarry Smith   }
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith   /* forward solve the U^T */
550f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
551f1af5d2fSBarry Smith 
552f1af5d2fSBarry Smith     v     = aa + diag[i];
553f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
554f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
555f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
556f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
557f1af5d2fSBarry Smith     while (nz--) {
558f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
559f1af5d2fSBarry Smith     }
560f1af5d2fSBarry Smith     t[i]   = s1;
561f1af5d2fSBarry Smith   }
562f1af5d2fSBarry Smith   /* backward solve the L^T */
563f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
564f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
565f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
566f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
567f1af5d2fSBarry Smith     s1   = t[i];
568f1af5d2fSBarry Smith     while (nz--) {
569f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
570f1af5d2fSBarry Smith     }
571f1af5d2fSBarry Smith   }
572f1af5d2fSBarry Smith 
573f1af5d2fSBarry Smith   /* copy t into x according to permutation */
574f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
575f1af5d2fSBarry Smith     x[r[i]]   = t[i];
576f1af5d2fSBarry Smith   }
577f1af5d2fSBarry Smith 
578f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
579f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5801ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
582dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
583f1af5d2fSBarry Smith   PetscFunctionReturn(0);
584f1af5d2fSBarry Smith }
585f1af5d2fSBarry Smith 
5864a2ae208SSatish Balay #undef __FUNCT__
5874a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
588dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
589f1af5d2fSBarry Smith {
590f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
591f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5926849ba73SBarry Smith   PetscErrorCode ierr;
5935d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5945d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
595690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
596f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
59787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
59887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
599f1af5d2fSBarry Smith 
600f1af5d2fSBarry Smith   PetscFunctionBegin;
6011ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6021ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
603f1af5d2fSBarry Smith   t  = a->solve_work;
604f1af5d2fSBarry Smith 
605f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
606f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
607f1af5d2fSBarry Smith 
608f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
609f1af5d2fSBarry Smith   ii = 0;
610f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
611f1af5d2fSBarry Smith     ic      = 2*c[i];
612f1af5d2fSBarry Smith     t[ii]   = b[ic];
613f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
614f1af5d2fSBarry Smith     ii += 2;
615f1af5d2fSBarry Smith   }
616f1af5d2fSBarry Smith 
617f1af5d2fSBarry Smith   /* forward solve the U^T */
618f1af5d2fSBarry Smith   idx = 0;
619f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
620f1af5d2fSBarry Smith 
621f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
622f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
623f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
624f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
625f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
626f1af5d2fSBarry Smith     v += 4;
627f1af5d2fSBarry Smith 
628f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
629f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
630f1af5d2fSBarry Smith     while (nz--) {
631f1af5d2fSBarry Smith       oidx = 2*(*vi++);
632f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
633f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
634f1af5d2fSBarry Smith       v  += 4;
635f1af5d2fSBarry Smith     }
636f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
637f1af5d2fSBarry Smith     idx += 2;
638f1af5d2fSBarry Smith   }
639f1af5d2fSBarry Smith   /* backward solve the L^T */
640f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
641f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
642f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
643f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
644f1af5d2fSBarry Smith     idt  = 2*i;
645f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
646f1af5d2fSBarry Smith     while (nz--) {
647f1af5d2fSBarry Smith       idx   = 2*(*vi--);
648f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
649f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
650f1af5d2fSBarry Smith       v -= 4;
651f1af5d2fSBarry Smith     }
652f1af5d2fSBarry Smith   }
653f1af5d2fSBarry Smith 
654f1af5d2fSBarry Smith   /* copy t into x according to permutation */
655f1af5d2fSBarry Smith   ii = 0;
656f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
657f1af5d2fSBarry Smith     ir      = 2*r[i];
658f1af5d2fSBarry Smith     x[ir]   = t[ii];
659f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
660f1af5d2fSBarry Smith     ii += 2;
661f1af5d2fSBarry Smith   }
662f1af5d2fSBarry Smith 
663f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
664f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6651ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6661ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
667dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
668f1af5d2fSBarry Smith   PetscFunctionReturn(0);
669f1af5d2fSBarry Smith }
670f1af5d2fSBarry Smith 
6714a2ae208SSatish Balay #undef __FUNCT__
6724a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
673dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
674f1af5d2fSBarry Smith {
675f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
676f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6776849ba73SBarry Smith   PetscErrorCode ierr;
6785d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6795d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
680690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
681f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
68287828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
68387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
684f1af5d2fSBarry Smith 
685f1af5d2fSBarry Smith   PetscFunctionBegin;
6861ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
688f1af5d2fSBarry Smith   t  = a->solve_work;
689f1af5d2fSBarry Smith 
690f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
691f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
692f1af5d2fSBarry Smith 
693f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
694f1af5d2fSBarry Smith   ii = 0;
695f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
696f1af5d2fSBarry Smith     ic      = 3*c[i];
697f1af5d2fSBarry Smith     t[ii]   = b[ic];
698f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
699f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
700f1af5d2fSBarry Smith     ii += 3;
701f1af5d2fSBarry Smith   }
702f1af5d2fSBarry Smith 
703f1af5d2fSBarry Smith   /* forward solve the U^T */
704f1af5d2fSBarry Smith   idx = 0;
705f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
706f1af5d2fSBarry Smith 
707f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
708f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
709f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
710f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
711f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
712f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
713f1af5d2fSBarry Smith     v += 9;
714f1af5d2fSBarry Smith 
715f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
716f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
717f1af5d2fSBarry Smith     while (nz--) {
718f1af5d2fSBarry Smith       oidx = 3*(*vi++);
719f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
720f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
721f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
722f1af5d2fSBarry Smith       v  += 9;
723f1af5d2fSBarry Smith     }
724f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
725f1af5d2fSBarry Smith     idx += 3;
726f1af5d2fSBarry Smith   }
727f1af5d2fSBarry Smith   /* backward solve the L^T */
728f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
729f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
730f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
731f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
732f1af5d2fSBarry Smith     idt  = 3*i;
733f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
734f1af5d2fSBarry Smith     while (nz--) {
735f1af5d2fSBarry Smith       idx   = 3*(*vi--);
736f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
737f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
738f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
739f1af5d2fSBarry Smith       v -= 9;
740f1af5d2fSBarry Smith     }
741f1af5d2fSBarry Smith   }
742f1af5d2fSBarry Smith 
743f1af5d2fSBarry Smith   /* copy t into x according to permutation */
744f1af5d2fSBarry Smith   ii = 0;
745f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
746f1af5d2fSBarry Smith     ir      = 3*r[i];
747f1af5d2fSBarry Smith     x[ir]   = t[ii];
748f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
749f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
750f1af5d2fSBarry Smith     ii += 3;
751f1af5d2fSBarry Smith   }
752f1af5d2fSBarry Smith 
753f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
754f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
757dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
758f1af5d2fSBarry Smith   PetscFunctionReturn(0);
759f1af5d2fSBarry Smith }
760f1af5d2fSBarry Smith 
7614a2ae208SSatish Balay #undef __FUNCT__
7624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
763dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
764f1af5d2fSBarry Smith {
765f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
766f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7676849ba73SBarry Smith   PetscErrorCode ierr;
7685d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7695d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
770690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
771f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
77287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
77387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
774f1af5d2fSBarry Smith 
775f1af5d2fSBarry Smith   PetscFunctionBegin;
7761ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
778f1af5d2fSBarry Smith   t  = a->solve_work;
779f1af5d2fSBarry Smith 
780f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
781f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
782f1af5d2fSBarry Smith 
783f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
784f1af5d2fSBarry Smith   ii = 0;
785f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
786f1af5d2fSBarry Smith     ic      = 4*c[i];
787f1af5d2fSBarry Smith     t[ii]   = b[ic];
788f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
789f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
790f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
791f1af5d2fSBarry Smith     ii += 4;
792f1af5d2fSBarry Smith   }
793f1af5d2fSBarry Smith 
794f1af5d2fSBarry Smith   /* forward solve the U^T */
795f1af5d2fSBarry Smith   idx = 0;
796f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
797f1af5d2fSBarry Smith 
798f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
799f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
800f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
801f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
802f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
803f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
804f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
805f1af5d2fSBarry Smith     v += 16;
806f1af5d2fSBarry Smith 
807f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
808f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
809f1af5d2fSBarry Smith     while (nz--) {
810f1af5d2fSBarry Smith       oidx = 4*(*vi++);
811f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
812f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
813f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
814f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
815f1af5d2fSBarry Smith       v  += 16;
816f1af5d2fSBarry Smith     }
817f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
818f1af5d2fSBarry Smith     idx += 4;
819f1af5d2fSBarry Smith   }
820f1af5d2fSBarry Smith   /* backward solve the L^T */
821f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
822f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
823f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
824f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
825f1af5d2fSBarry Smith     idt  = 4*i;
826f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
827f1af5d2fSBarry Smith     while (nz--) {
828f1af5d2fSBarry Smith       idx   = 4*(*vi--);
829f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
830f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
831f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
832f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
833f1af5d2fSBarry Smith       v -= 16;
834f1af5d2fSBarry Smith     }
835f1af5d2fSBarry Smith   }
836f1af5d2fSBarry Smith 
837f1af5d2fSBarry Smith   /* copy t into x according to permutation */
838f1af5d2fSBarry Smith   ii = 0;
839f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
840f1af5d2fSBarry Smith     ir      = 4*r[i];
841f1af5d2fSBarry Smith     x[ir]   = t[ii];
842f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
843f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
844f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
845f1af5d2fSBarry Smith     ii += 4;
846f1af5d2fSBarry Smith   }
847f1af5d2fSBarry Smith 
848f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
849f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
852dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
853f1af5d2fSBarry Smith   PetscFunctionReturn(0);
854f1af5d2fSBarry Smith }
855f1af5d2fSBarry Smith 
8564a2ae208SSatish Balay #undef __FUNCT__
8574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
858dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
859f1af5d2fSBarry Smith {
860f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
861f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8626849ba73SBarry Smith   PetscErrorCode ierr;
8635d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8645d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
865690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
866f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
86787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
86887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
869f1af5d2fSBarry Smith 
870f1af5d2fSBarry Smith   PetscFunctionBegin;
8711ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8721ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
873f1af5d2fSBarry Smith   t  = a->solve_work;
874f1af5d2fSBarry Smith 
875f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
876f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
877f1af5d2fSBarry Smith 
878f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
879f1af5d2fSBarry Smith   ii = 0;
880f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
881f1af5d2fSBarry Smith     ic      = 5*c[i];
882f1af5d2fSBarry Smith     t[ii]   = b[ic];
883f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
884f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
885f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
886f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
887f1af5d2fSBarry Smith     ii += 5;
888f1af5d2fSBarry Smith   }
889f1af5d2fSBarry Smith 
890f1af5d2fSBarry Smith   /* forward solve the U^T */
891f1af5d2fSBarry Smith   idx = 0;
892f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
893f1af5d2fSBarry Smith 
894f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
895f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
896f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
897f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
898f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
899f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
900f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
901f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
902f1af5d2fSBarry Smith     v += 25;
903f1af5d2fSBarry Smith 
904f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
905f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
906f1af5d2fSBarry Smith     while (nz--) {
907f1af5d2fSBarry Smith       oidx = 5*(*vi++);
908f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
909f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
910f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
911f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
912f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
913f1af5d2fSBarry Smith       v  += 25;
914f1af5d2fSBarry Smith     }
915f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
916f1af5d2fSBarry Smith     idx += 5;
917f1af5d2fSBarry Smith   }
918f1af5d2fSBarry Smith   /* backward solve the L^T */
919f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
920f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
921f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
922f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
923f1af5d2fSBarry Smith     idt  = 5*i;
924f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
925f1af5d2fSBarry Smith     while (nz--) {
926f1af5d2fSBarry Smith       idx   = 5*(*vi--);
927f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
928f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
929f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
930f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
931f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
932f1af5d2fSBarry Smith       v -= 25;
933f1af5d2fSBarry Smith     }
934f1af5d2fSBarry Smith   }
935f1af5d2fSBarry Smith 
936f1af5d2fSBarry Smith   /* copy t into x according to permutation */
937f1af5d2fSBarry Smith   ii = 0;
938f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
939f1af5d2fSBarry Smith     ir      = 5*r[i];
940f1af5d2fSBarry Smith     x[ir]   = t[ii];
941f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
942f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
943f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
944f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
945f1af5d2fSBarry Smith     ii += 5;
946f1af5d2fSBarry Smith   }
947f1af5d2fSBarry Smith 
948f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
949f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
952dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
953f1af5d2fSBarry Smith   PetscFunctionReturn(0);
954f1af5d2fSBarry Smith }
955f1af5d2fSBarry Smith 
9564a2ae208SSatish Balay #undef __FUNCT__
9574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
958dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
959f1af5d2fSBarry Smith {
960f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
961f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9626849ba73SBarry Smith   PetscErrorCode ierr;
9635d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9645d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
965690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
966f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
96787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
96887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
969f1af5d2fSBarry Smith 
970f1af5d2fSBarry Smith   PetscFunctionBegin;
9711ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9721ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
973f1af5d2fSBarry Smith   t  = a->solve_work;
974f1af5d2fSBarry Smith 
975f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
976f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
977f1af5d2fSBarry Smith 
978f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
979f1af5d2fSBarry Smith   ii = 0;
980f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
981f1af5d2fSBarry Smith     ic      = 6*c[i];
982f1af5d2fSBarry Smith     t[ii]   = b[ic];
983f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
984f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
985f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
986f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
987f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
988f1af5d2fSBarry Smith     ii += 6;
989f1af5d2fSBarry Smith   }
990f1af5d2fSBarry Smith 
991f1af5d2fSBarry Smith   /* forward solve the U^T */
992f1af5d2fSBarry Smith   idx = 0;
993f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
996f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
997f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
998f1af5d2fSBarry Smith     x6    = t[5+idx];
999f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1000f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1001f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1002f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1003f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1004f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1005f1af5d2fSBarry Smith     v += 36;
1006f1af5d2fSBarry Smith 
1007f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1008f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1009f1af5d2fSBarry Smith     while (nz--) {
1010f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1011f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1012f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1013f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1014f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1015f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1016f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1017f1af5d2fSBarry Smith       v  += 36;
1018f1af5d2fSBarry Smith     }
1019f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1020f1af5d2fSBarry Smith     t[5+idx] = s6;
1021f1af5d2fSBarry Smith     idx += 6;
1022f1af5d2fSBarry Smith   }
1023f1af5d2fSBarry Smith   /* backward solve the L^T */
1024f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1025f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1026f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1027f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1028f1af5d2fSBarry Smith     idt  = 6*i;
1029f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1030f1af5d2fSBarry Smith     s6 = t[5+idt];
1031f1af5d2fSBarry Smith     while (nz--) {
1032f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1033f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1034f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1035f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1036f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1037f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1038f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1039f1af5d2fSBarry Smith       v -= 36;
1040f1af5d2fSBarry Smith     }
1041f1af5d2fSBarry Smith   }
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1044f1af5d2fSBarry Smith   ii = 0;
1045f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1046f1af5d2fSBarry Smith     ir      = 6*r[i];
1047f1af5d2fSBarry Smith     x[ir]   = t[ii];
1048f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1049f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1050f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1051f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1052f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1053f1af5d2fSBarry Smith     ii += 6;
1054f1af5d2fSBarry Smith   }
1055f1af5d2fSBarry Smith 
1056f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1057f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10581ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1060dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1061f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1062f1af5d2fSBarry Smith }
1063f1af5d2fSBarry Smith 
10644a2ae208SSatish Balay #undef __FUNCT__
10654a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1066dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1067f1af5d2fSBarry Smith {
1068f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1069f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10706849ba73SBarry Smith   PetscErrorCode ierr;
10715d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10725d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1073690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1074f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
107587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
107687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1077f1af5d2fSBarry Smith 
1078f1af5d2fSBarry Smith   PetscFunctionBegin;
10791ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1081f1af5d2fSBarry Smith   t  = a->solve_work;
1082f1af5d2fSBarry Smith 
1083f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1084f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1085f1af5d2fSBarry Smith 
1086f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1087f1af5d2fSBarry Smith   ii = 0;
1088f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1089f1af5d2fSBarry Smith     ic      = 7*c[i];
1090f1af5d2fSBarry Smith     t[ii]   = b[ic];
1091f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1092f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1093f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1094f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1095f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1096f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1097f1af5d2fSBarry Smith     ii += 7;
1098f1af5d2fSBarry Smith   }
1099f1af5d2fSBarry Smith 
1100f1af5d2fSBarry Smith   /* forward solve the U^T */
1101f1af5d2fSBarry Smith   idx = 0;
1102f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1103f1af5d2fSBarry Smith 
1104f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1105f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1106f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1107f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1108f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1109f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1110f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1111f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1112f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1113f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1114f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1115f1af5d2fSBarry Smith     v += 49;
1116f1af5d2fSBarry Smith 
1117f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1118f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1119f1af5d2fSBarry Smith     while (nz--) {
1120f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1121f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1122f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1123f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1124f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1125f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1126f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1127f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1128f1af5d2fSBarry Smith       v  += 49;
1129f1af5d2fSBarry Smith     }
1130f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1131f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1132f1af5d2fSBarry Smith     idx += 7;
1133f1af5d2fSBarry Smith   }
1134f1af5d2fSBarry Smith   /* backward solve the L^T */
1135f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1136f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1137f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1138f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1139f1af5d2fSBarry Smith     idt  = 7*i;
1140f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1141f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1142f1af5d2fSBarry Smith     while (nz--) {
1143f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1144f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1145f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1146f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1147f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1148f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1149f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1150f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1151f1af5d2fSBarry Smith       v -= 49;
1152f1af5d2fSBarry Smith     }
1153f1af5d2fSBarry Smith   }
1154f1af5d2fSBarry Smith 
1155f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1156f1af5d2fSBarry Smith   ii = 0;
1157f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1158f1af5d2fSBarry Smith     ir      = 7*r[i];
1159f1af5d2fSBarry Smith     x[ir]   = t[ii];
1160f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1161f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1162f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1163f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1164f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1165f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1166f1af5d2fSBarry Smith     ii += 7;
1167f1af5d2fSBarry Smith   }
1168f1af5d2fSBarry Smith 
1169f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1170f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11711ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1173dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1174f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1175f1af5d2fSBarry Smith }
1176f1af5d2fSBarry Smith 
11774e2b4712SSatish Balay /* ----------------------------------------------------------- */
11784a2ae208SSatish Balay #undef __FUNCT__
11794a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1180dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11814e2b4712SSatish Balay {
11824e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11834e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11846849ba73SBarry Smith   PetscErrorCode ierr;
11855d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11865d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11875d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11883f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118987828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11904e2b4712SSatish Balay 
11914e2b4712SSatish Balay   PetscFunctionBegin;
11921ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11931ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1194f1af5d2fSBarry Smith   t  = a->solve_work;
11954e2b4712SSatish Balay 
11964e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11974e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11984e2b4712SSatish Balay 
11994e2b4712SSatish Balay   /* forward solve the lower triangular */
120087828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
12014e2b4712SSatish Balay   for (i=1; i<n; i++) {
12024e2b4712SSatish Balay     v   = aa + bs2*ai[i];
12034e2b4712SSatish Balay     vi  = aj + ai[i];
12044e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1205f1af5d2fSBarry Smith     s = t + bs*i;
120687828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
12074e2b4712SSatish Balay     while (nz--) {
1208f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
12094e2b4712SSatish Balay       v += bs2;
12104e2b4712SSatish Balay     }
12114e2b4712SSatish Balay   }
12124e2b4712SSatish Balay   /* backward solve the upper triangular */
1213d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
12144e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12154e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
12164e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
12174e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
121887828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12194e2b4712SSatish Balay     while (nz--) {
1220f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
12214e2b4712SSatish Balay       v += bs2;
12224e2b4712SSatish Balay     }
1223f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
122487828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12254e2b4712SSatish Balay   }
12264e2b4712SSatish Balay 
12274e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12284e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12291ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1231dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
12324e2b4712SSatish Balay   PetscFunctionReturn(0);
12334e2b4712SSatish Balay }
12344e2b4712SSatish Balay 
12355c42ef9dSBarry Smith /* ----------------------------------------------------------- */
12365c42ef9dSBarry Smith #undef __FUNCT__
12375c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
12385c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
12395c42ef9dSBarry Smith {
12405c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
12415c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
12425c42ef9dSBarry Smith   PetscErrorCode    ierr;
12435c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
12445c42ef9dSBarry Smith   PetscInt          i,n=a->mbs,j;
12455c42ef9dSBarry Smith   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
12465c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
12475c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
12485c42ef9dSBarry Smith   const PetscScalar *b;
12495c42ef9dSBarry Smith   PetscFunctionBegin;
12505c42ef9dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12515c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
12525c42ef9dSBarry Smith   t    = a->solve_work;
12535c42ef9dSBarry Smith 
12545c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
12555c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
12565c42ef9dSBarry Smith 
12575c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
12585c42ef9dSBarry Smith   for (i=0; i<n; i++) {
12595c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
12605c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
12615c42ef9dSBarry Smith     }
12625c42ef9dSBarry Smith   }
12635c42ef9dSBarry Smith 
12645c42ef9dSBarry Smith 
12655c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
12665c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
12675c42ef9dSBarry Smith   for (i=0; i<n; i++){
12685c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12695c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
12705c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
12715c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
12725c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
12735c42ef9dSBarry Smith     while (nz--) {
12745c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
12755c42ef9dSBarry Smith       v += bs2;
12765c42ef9dSBarry Smith     }
12775c42ef9dSBarry Smith   }
12785c42ef9dSBarry Smith 
12795c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
12805c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
12815c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
12825c42ef9dSBarry Smith     vi  = aj + ai[i];
12835c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
12845c42ef9dSBarry Smith     while (nz--) {
12855c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
12865c42ef9dSBarry Smith       v += bs2;
12875c42ef9dSBarry Smith     }
12885c42ef9dSBarry Smith   }
12895c42ef9dSBarry Smith 
12905c42ef9dSBarry Smith   /* copy t into x according to permutation */
12915c42ef9dSBarry Smith   for (i=0; i<n; i++) {
12925c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
12935c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
12945c42ef9dSBarry Smith     }
12955c42ef9dSBarry Smith   }
12965c42ef9dSBarry Smith 
12975c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12985c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12995c42ef9dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13005c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
13015c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
13025c42ef9dSBarry Smith   PetscFunctionReturn(0);
13035c42ef9dSBarry Smith }
13045c42ef9dSBarry Smith 
13054a2ae208SSatish Balay #undef __FUNCT__
13064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1307dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
13084e2b4712SSatish Balay {
13094e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
13104e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
13116849ba73SBarry Smith   PetscErrorCode ierr;
13125d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
13135d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
13143f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
131587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
131687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
13174e2b4712SSatish Balay 
13184e2b4712SSatish Balay   PetscFunctionBegin;
13191ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
13201ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1321f1af5d2fSBarry Smith   t  = a->solve_work;
13224e2b4712SSatish Balay 
13234e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
13244e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
13254e2b4712SSatish Balay 
13264e2b4712SSatish Balay   /* forward solve the lower triangular */
13274e2b4712SSatish Balay   idx    = 7*(*r++);
1328f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1329f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1330f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
13314e2b4712SSatish Balay 
13324e2b4712SSatish Balay   for (i=1; i<n; i++) {
13334e2b4712SSatish Balay     v     = aa + 49*ai[i];
13344e2b4712SSatish Balay     vi    = aj + ai[i];
13354e2b4712SSatish Balay     nz    = diag[i] - ai[i];
13364e2b4712SSatish Balay     idx   = 7*(*r++);
1337f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1338f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
13394e2b4712SSatish Balay     while (nz--) {
13404e2b4712SSatish Balay       idx   = 7*(*vi++);
1341f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1342f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1343f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1344f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1345f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1346f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1347f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1348f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1349f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1350f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13514e2b4712SSatish Balay       v += 49;
13524e2b4712SSatish Balay     }
13534e2b4712SSatish Balay     idx = 7*i;
1354f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1355f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1356f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
13574e2b4712SSatish Balay   }
13584e2b4712SSatish Balay   /* backward solve the upper triangular */
13594e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
13604e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
13614e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
13624e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
13634e2b4712SSatish Balay     idt  = 7*i;
1364f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1365f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1366f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
13674e2b4712SSatish Balay     while (nz--) {
13684e2b4712SSatish Balay       idx   = 7*(*vi++);
1369f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1370f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1371f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1372f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1373f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1374f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1375f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1376f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1377f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1378f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13794e2b4712SSatish Balay       v += 49;
13804e2b4712SSatish Balay     }
13814e2b4712SSatish Balay     idc = 7*(*c--);
13824e2b4712SSatish Balay     v   = aa + 49*diag[i];
1383f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1384f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1385f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1386f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1387f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1388f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1389f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1390f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1391f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1392f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1393f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1394f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1395f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1396f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13974e2b4712SSatish Balay   }
13984e2b4712SSatish Balay 
13994e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
14004e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14011ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
14021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1403dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
14044e2b4712SSatish Balay   PetscFunctionReturn(0);
14054e2b4712SSatish Balay }
14064e2b4712SSatish Balay 
14078f690400SShri Abhyankar #undef __FUNCT__
1408a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1409a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
141035aa4fcfSShri Abhyankar {
141135aa4fcfSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
141235aa4fcfSShri Abhyankar   IS             iscol=a->col,isrow=a->row;
141335aa4fcfSShri Abhyankar   PetscErrorCode ierr;
141435aa4fcfSShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
141535aa4fcfSShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
141635aa4fcfSShri Abhyankar   MatScalar      *aa=a->a,*v;
141735aa4fcfSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
141835aa4fcfSShri Abhyankar   PetscScalar    *x,*b,*t;
141935aa4fcfSShri Abhyankar 
142035aa4fcfSShri Abhyankar   PetscFunctionBegin;
142135aa4fcfSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
142235aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
142335aa4fcfSShri Abhyankar   t  = a->solve_work;
142435aa4fcfSShri Abhyankar 
142535aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
142635aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
142735aa4fcfSShri Abhyankar 
142835aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
142935aa4fcfSShri Abhyankar   idx    = 7*r[0];
143035aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
143135aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
143235aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
143335aa4fcfSShri Abhyankar 
143435aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
143535aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
143635aa4fcfSShri Abhyankar     vi    = aj + ai[i];
143735aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
143835aa4fcfSShri Abhyankar     idx   = 7*r[i];
143935aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
144035aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
144135aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
144235aa4fcfSShri Abhyankar       idx   = 7*vi[m];
144335aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
144435aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
144535aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
144635aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
144735aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
144835aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
144935aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
145035aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
145135aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
145235aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
145335aa4fcfSShri Abhyankar       v += 49;
145435aa4fcfSShri Abhyankar     }
145535aa4fcfSShri Abhyankar     idx = 7*i;
145635aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
145735aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
145835aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
145935aa4fcfSShri Abhyankar   }
146035aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
146135aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
146235aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
146335aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
146435aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
146535aa4fcfSShri Abhyankar     idt  = 7*i;
146635aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
146735aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
146835aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
146935aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
147035aa4fcfSShri Abhyankar       idx   = 7*vi[m];
147135aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
147235aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
147335aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
147435aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
147535aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
147635aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
147735aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
147835aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
147935aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
148035aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
148135aa4fcfSShri Abhyankar       v += 49;
148235aa4fcfSShri Abhyankar     }
148335aa4fcfSShri Abhyankar     idc = 7*c[i];
148435aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
148535aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
148635aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
148735aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
148835aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
148935aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
149035aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
149135aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
149235aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
149335aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
149435aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
149535aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
149635aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
149735aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
149835aa4fcfSShri Abhyankar   }
149935aa4fcfSShri Abhyankar 
150035aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
150135aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
150235aa4fcfSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
150335aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
150435aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
150535aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
150635aa4fcfSShri Abhyankar }
150735aa4fcfSShri Abhyankar 
150835aa4fcfSShri Abhyankar #undef __FUNCT__
15094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1510dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
151115091d37SBarry Smith {
151215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1513690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1514dfbe8321SBarry Smith   PetscErrorCode    ierr;
1515690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1516d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1517d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1518d9fead3dSBarry Smith   const PetscScalar *b;
151915091d37SBarry Smith 
152015091d37SBarry Smith   PetscFunctionBegin;
1521d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
152315091d37SBarry Smith   /* forward solve the lower triangular */
152415091d37SBarry Smith   idx    = 0;
152515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
152615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
152715091d37SBarry Smith   x[6] = b[6+idx];
152815091d37SBarry Smith   for (i=1; i<n; i++) {
152915091d37SBarry Smith     v     =  aa + 49*ai[i];
153015091d37SBarry Smith     vi    =  aj + ai[i];
153115091d37SBarry Smith     nz    =  diag[i] - ai[i];
153215091d37SBarry Smith     idx   =  7*i;
1533f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1534f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1535f1af5d2fSBarry Smith     s7  =  b[6+idx];
153615091d37SBarry Smith     while (nz--) {
153715091d37SBarry Smith       jdx   = 7*(*vi++);
153815091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
153915091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
154015091d37SBarry Smith       x7    = x[6+jdx];
1541f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1542f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1543f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1544f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1545f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1546f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1547f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
154815091d37SBarry Smith       v += 49;
154915091d37SBarry Smith      }
1550f1af5d2fSBarry Smith     x[idx]   = s1;
1551f1af5d2fSBarry Smith     x[1+idx] = s2;
1552f1af5d2fSBarry Smith     x[2+idx] = s3;
1553f1af5d2fSBarry Smith     x[3+idx] = s4;
1554f1af5d2fSBarry Smith     x[4+idx] = s5;
1555f1af5d2fSBarry Smith     x[5+idx] = s6;
1556f1af5d2fSBarry Smith     x[6+idx] = s7;
155715091d37SBarry Smith   }
155815091d37SBarry Smith   /* backward solve the upper triangular */
155915091d37SBarry Smith   for (i=n-1; i>=0; i--){
156015091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
156115091d37SBarry Smith     vi   = aj + diag[i] + 1;
156215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
156315091d37SBarry Smith     idt  = 7*i;
1564f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1565f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1566f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1567f1af5d2fSBarry Smith     s7 = x[6+idt];
156815091d37SBarry Smith     while (nz--) {
156915091d37SBarry Smith       idx   = 7*(*vi++);
157015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
157115091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
157215091d37SBarry Smith       x7    = x[6+idx];
1573f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1574f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1575f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1576f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1577f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1578f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1579f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
158015091d37SBarry Smith       v += 49;
158115091d37SBarry Smith     }
158215091d37SBarry Smith     v        = aa + 49*diag[i];
1583f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1584f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1585f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1586f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1587f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1588f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1589f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1590f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1591f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1592f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1593f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1594f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1595f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1596f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
159715091d37SBarry Smith   }
159815091d37SBarry Smith 
1599d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1601dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
160215091d37SBarry Smith   PetscFunctionReturn(0);
160315091d37SBarry Smith }
160415091d37SBarry Smith 
1605cee9d6f2SShri Abhyankar #undef __FUNCT__
1606a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1607a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
160853cca76cSShri Abhyankar {
160953cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
161053cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
161153cca76cSShri Abhyankar     PetscErrorCode    ierr;
161253cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
161353cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
161453cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
161553cca76cSShri Abhyankar     PetscScalar       *x;
161653cca76cSShri Abhyankar     const PetscScalar *b;
161753cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
161853cca76cSShri Abhyankar 
161953cca76cSShri Abhyankar     PetscFunctionBegin;
162053cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
162153cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
162253cca76cSShri Abhyankar     /* forward solve the lower triangular */
162353cca76cSShri Abhyankar     idx    = 0;
162453cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
162553cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
162653cca76cSShri Abhyankar     for (i=1; i<n; i++) {
162753cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
162853cca76cSShri Abhyankar        vi   = aj + ai[i];
162953cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
163053cca76cSShri Abhyankar       idx   = bs*i;
163153cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
163253cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
163353cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
163453cca76cSShri Abhyankar           jdx   = bs*vi[k];
163553cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
163653cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
163753cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
163853cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
163953cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
164053cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
164153cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
164253cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
164353cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
164453cca76cSShri Abhyankar           v   +=  bs2;
164553cca76cSShri Abhyankar         }
164653cca76cSShri Abhyankar 
164753cca76cSShri Abhyankar        x[idx]   = s1;
164853cca76cSShri Abhyankar        x[1+idx] = s2;
164953cca76cSShri Abhyankar        x[2+idx] = s3;
165053cca76cSShri Abhyankar        x[3+idx] = s4;
165153cca76cSShri Abhyankar        x[4+idx] = s5;
165253cca76cSShri Abhyankar        x[5+idx] = s6;
165353cca76cSShri Abhyankar        x[6+idx] = s7;
165453cca76cSShri Abhyankar     }
165553cca76cSShri Abhyankar 
165653cca76cSShri Abhyankar    /* backward solve the upper triangular */
165753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
165853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
165953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
166053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
166153cca76cSShri Abhyankar      idt = bs*i;
166253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
166353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
166453cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
166553cca76cSShri Abhyankar       idx   = bs*vi[k];
166653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
166753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
166853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
166953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
167053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
167153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
167253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
167353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
167453cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
167553cca76cSShri Abhyankar         v   +=  bs2;
167653cca76cSShri Abhyankar     }
167753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
167853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
167953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
168053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
168153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
168253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
168353cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
168453cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
168553cca76cSShri Abhyankar   }
168653cca76cSShri Abhyankar 
168753cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
168853cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
168953cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
169053cca76cSShri Abhyankar   PetscFunctionReturn(0);
169153cca76cSShri Abhyankar }
169253cca76cSShri Abhyankar 
169353cca76cSShri Abhyankar #undef __FUNCT__
16944a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1695dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
169615091d37SBarry Smith {
169715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
169815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
16996849ba73SBarry Smith   PetscErrorCode    ierr;
17005d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
17015d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1702d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1703d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1704d9fead3dSBarry Smith   const PetscScalar *b;
170515091d37SBarry Smith   PetscFunctionBegin;
1706d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17071ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1708f1af5d2fSBarry Smith   t  = a->solve_work;
170915091d37SBarry Smith 
171015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
171115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
171215091d37SBarry Smith 
171315091d37SBarry Smith   /* forward solve the lower triangular */
171415091d37SBarry Smith   idx    = 6*(*r++);
1715f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1716f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1717f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
171815091d37SBarry Smith   for (i=1; i<n; i++) {
171915091d37SBarry Smith     v     = aa + 36*ai[i];
172015091d37SBarry Smith     vi    = aj + ai[i];
172115091d37SBarry Smith     nz    = diag[i] - ai[i];
172215091d37SBarry Smith     idx   = 6*(*r++);
1723f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1724f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
172515091d37SBarry Smith     while (nz--) {
172615091d37SBarry Smith       idx   = 6*(*vi++);
1727f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1728f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1729f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1730f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1731f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1732f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1733f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1734f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
173515091d37SBarry Smith       v += 36;
173615091d37SBarry Smith     }
173715091d37SBarry Smith     idx = 6*i;
1738f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1739f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1740f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
174115091d37SBarry Smith   }
174215091d37SBarry Smith   /* backward solve the upper triangular */
174315091d37SBarry Smith   for (i=n-1; i>=0; i--){
174415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
174515091d37SBarry Smith     vi   = aj + diag[i] + 1;
174615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
174715091d37SBarry Smith     idt  = 6*i;
1748f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1749f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1750f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
175115091d37SBarry Smith     while (nz--) {
175215091d37SBarry Smith       idx   = 6*(*vi++);
1753f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1754f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1755f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1756f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1757f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1758f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1759f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1760f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1761f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
176215091d37SBarry Smith       v += 36;
176315091d37SBarry Smith     }
176415091d37SBarry Smith     idc = 6*(*c--);
176515091d37SBarry Smith     v   = aa + 36*diag[i];
1766f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1767f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1768f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1769f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1770f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1771f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1772f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1773f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1774f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1775f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1776f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1777f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
177815091d37SBarry Smith   }
177915091d37SBarry Smith 
178015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
178115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1782d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17831ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1784dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
178515091d37SBarry Smith   PetscFunctionReturn(0);
178615091d37SBarry Smith }
178715091d37SBarry Smith 
17886506fda5SShri Abhyankar #undef __FUNCT__
1789a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
1790a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
17916506fda5SShri Abhyankar {
17926506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17936506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
17946506fda5SShri Abhyankar   PetscErrorCode    ierr;
17956506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
17966506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
17976506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
17986506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
17996506fda5SShri Abhyankar   const PetscScalar *b;
18006506fda5SShri Abhyankar   PetscFunctionBegin;
18016506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18026506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
18036506fda5SShri Abhyankar   t  = a->solve_work;
18046506fda5SShri Abhyankar 
18056506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
18066506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
18076506fda5SShri Abhyankar 
18086506fda5SShri Abhyankar   /* forward solve the lower triangular */
18096506fda5SShri Abhyankar   idx    = 6*r[0];
18106506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
18116506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
18126506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
18136506fda5SShri Abhyankar   for (i=1; i<n; i++) {
18146506fda5SShri Abhyankar     v     = aa + 36*ai[i];
18156506fda5SShri Abhyankar     vi    = aj + ai[i];
18166506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
18176506fda5SShri Abhyankar     idx   = 6*r[i];
18186506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
18196506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
18206506fda5SShri Abhyankar     for(m=0;m<nz;m++){
18216506fda5SShri Abhyankar       idx   = 6*vi[m];
18226506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
18236506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
18246506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18256506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18266506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18276506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18286506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18296506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18306506fda5SShri Abhyankar       v += 36;
18316506fda5SShri Abhyankar     }
18326506fda5SShri Abhyankar     idx = 6*i;
18336506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
18346506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
18356506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
18366506fda5SShri Abhyankar   }
18376506fda5SShri Abhyankar   /* backward solve the upper triangular */
18386506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
18396506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
18406506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
18416506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
18426506fda5SShri Abhyankar     idt  = 6*i;
18436506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
18446506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
18456506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
18466506fda5SShri Abhyankar     for(m=0;m<nz;m++){
18476506fda5SShri Abhyankar       idx   = 6*vi[m];
18486506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
18496506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
18506506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
18516506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
18526506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
18536506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
18546506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
18556506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
18566506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
18576506fda5SShri Abhyankar       v += 36;
18586506fda5SShri Abhyankar     }
18596506fda5SShri Abhyankar     idc = 6*c[i];
18606506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
18616506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
18626506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
18636506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
18646506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
18656506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
18666506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
18676506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
18686506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
18696506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
18706506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
18716506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
18726506fda5SShri Abhyankar   }
18736506fda5SShri Abhyankar 
18746506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18756506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18766506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18776506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
18786506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
18796506fda5SShri Abhyankar   PetscFunctionReturn(0);
18806506fda5SShri Abhyankar }
18818f690400SShri Abhyankar 
18828f690400SShri Abhyankar #undef __FUNCT__
18834a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1884dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
188515091d37SBarry Smith {
188615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1887690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1888dfbe8321SBarry Smith   PetscErrorCode    ierr;
1889690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1890d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1891d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1892d9fead3dSBarry Smith   const PetscScalar *b;
189315091d37SBarry Smith 
189415091d37SBarry Smith   PetscFunctionBegin;
1895d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189715091d37SBarry Smith   /* forward solve the lower triangular */
189815091d37SBarry Smith   idx    = 0;
189915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
190015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
190115091d37SBarry Smith   for (i=1; i<n; i++) {
190215091d37SBarry Smith     v     =  aa + 36*ai[i];
190315091d37SBarry Smith     vi    =  aj + ai[i];
190415091d37SBarry Smith     nz    =  diag[i] - ai[i];
190515091d37SBarry Smith     idx   =  6*i;
1906f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1907f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
190815091d37SBarry Smith     while (nz--) {
190915091d37SBarry Smith       jdx   = 6*(*vi++);
191015091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
191115091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1912f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1913f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1914f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1915f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1916f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1917f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
191815091d37SBarry Smith       v += 36;
191915091d37SBarry Smith      }
1920f1af5d2fSBarry Smith     x[idx]   = s1;
1921f1af5d2fSBarry Smith     x[1+idx] = s2;
1922f1af5d2fSBarry Smith     x[2+idx] = s3;
1923f1af5d2fSBarry Smith     x[3+idx] = s4;
1924f1af5d2fSBarry Smith     x[4+idx] = s5;
1925f1af5d2fSBarry Smith     x[5+idx] = s6;
192615091d37SBarry Smith   }
192715091d37SBarry Smith   /* backward solve the upper triangular */
192815091d37SBarry Smith   for (i=n-1; i>=0; i--){
192915091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
193015091d37SBarry Smith     vi   = aj + diag[i] + 1;
193115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
193215091d37SBarry Smith     idt  = 6*i;
1933f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1934f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1935f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
193615091d37SBarry Smith     while (nz--) {
193715091d37SBarry Smith       idx   = 6*(*vi++);
193815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
193915091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1940f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1941f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1942f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1943f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1944f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1945f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
194615091d37SBarry Smith       v += 36;
194715091d37SBarry Smith     }
194815091d37SBarry Smith     v        = aa + 36*diag[i];
1949f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1950f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1951f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1952f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1953f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1954f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
195515091d37SBarry Smith   }
195615091d37SBarry Smith 
1957d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1959dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
196015091d37SBarry Smith   PetscFunctionReturn(0);
196115091d37SBarry Smith }
196215091d37SBarry Smith 
1963cee9d6f2SShri Abhyankar #undef __FUNCT__
1964a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
1965a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
196653cca76cSShri Abhyankar {
196753cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
196853cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
196953cca76cSShri Abhyankar     PetscErrorCode    ierr;
197053cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
197153cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
197253cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
197353cca76cSShri Abhyankar     PetscScalar       *x;
197453cca76cSShri Abhyankar     const PetscScalar *b;
197553cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
197653cca76cSShri Abhyankar 
197753cca76cSShri Abhyankar     PetscFunctionBegin;
197853cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
197953cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
198053cca76cSShri Abhyankar     /* forward solve the lower triangular */
198153cca76cSShri Abhyankar     idx    = 0;
198253cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
198353cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
198453cca76cSShri Abhyankar     for (i=1; i<n; i++) {
198553cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
198653cca76cSShri Abhyankar        vi   = aj + ai[i];
198753cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
198853cca76cSShri Abhyankar       idx   = bs*i;
198953cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
199053cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
199153cca76cSShri Abhyankar        for(k=0;k<nz;k++){
199253cca76cSShri Abhyankar           jdx   = bs*vi[k];
199353cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
199453cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
199553cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
199653cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
199753cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
199853cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
199953cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
200053cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
200153cca76cSShri Abhyankar           v   +=  bs2;
200253cca76cSShri Abhyankar         }
200353cca76cSShri Abhyankar 
200453cca76cSShri Abhyankar        x[idx]   = s1;
200553cca76cSShri Abhyankar        x[1+idx] = s2;
200653cca76cSShri Abhyankar        x[2+idx] = s3;
200753cca76cSShri Abhyankar        x[3+idx] = s4;
200853cca76cSShri Abhyankar        x[4+idx] = s5;
200953cca76cSShri Abhyankar        x[5+idx] = s6;
201053cca76cSShri Abhyankar     }
201153cca76cSShri Abhyankar 
201253cca76cSShri Abhyankar    /* backward solve the upper triangular */
201353cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
201453cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
201553cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
201653cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
201753cca76cSShri Abhyankar      idt = bs*i;
201853cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
201953cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
202053cca76cSShri Abhyankar      for(k=0;k<nz;k++){
202153cca76cSShri Abhyankar       idx   = bs*vi[k];
202253cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
202353cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
202453cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
202553cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
202653cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
202753cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
202853cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
202953cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
203053cca76cSShri Abhyankar         v   +=  bs2;
203153cca76cSShri Abhyankar     }
203253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
203353cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
203453cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
203553cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
203653cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
203753cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
203853cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
203953cca76cSShri Abhyankar   }
204053cca76cSShri Abhyankar 
204153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
204253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
204353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
204453cca76cSShri Abhyankar   PetscFunctionReturn(0);
204553cca76cSShri Abhyankar }
204653cca76cSShri Abhyankar 
204753cca76cSShri Abhyankar #undef __FUNCT__
20484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2049dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
20504e2b4712SSatish Balay {
20514e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20524e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
20536849ba73SBarry Smith   PetscErrorCode    ierr;
20545d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
20555d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2056d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2057d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2058d9fead3dSBarry Smith   const PetscScalar *b;
20594e2b4712SSatish Balay 
20604e2b4712SSatish Balay   PetscFunctionBegin;
2061d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2063f1af5d2fSBarry Smith   t  = a->solve_work;
20644e2b4712SSatish Balay 
20654e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
20664e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
20674e2b4712SSatish Balay 
20684e2b4712SSatish Balay   /* forward solve the lower triangular */
20694e2b4712SSatish Balay   idx    = 5*(*r++);
2070f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2071f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
20724e2b4712SSatish Balay   for (i=1; i<n; i++) {
20734e2b4712SSatish Balay     v     = aa + 25*ai[i];
20744e2b4712SSatish Balay     vi    = aj + ai[i];
20754e2b4712SSatish Balay     nz    = diag[i] - ai[i];
20764e2b4712SSatish Balay     idx   = 5*(*r++);
2077f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2078f1af5d2fSBarry Smith     s5  = b[4+idx];
20794e2b4712SSatish Balay     while (nz--) {
20804e2b4712SSatish Balay       idx   = 5*(*vi++);
2081f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2082f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2083f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2084f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2085f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2086f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2087f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
20884e2b4712SSatish Balay       v += 25;
20894e2b4712SSatish Balay     }
20904e2b4712SSatish Balay     idx = 5*i;
2091f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2092f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
20934e2b4712SSatish Balay   }
20944e2b4712SSatish Balay   /* backward solve the upper triangular */
20954e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20964e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
20974e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
20984e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
20994e2b4712SSatish Balay     idt  = 5*i;
2100f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2101f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
21024e2b4712SSatish Balay     while (nz--) {
21034e2b4712SSatish Balay       idx   = 5*(*vi++);
2104f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2105f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2106f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2107f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2108f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2109f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2110f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
21114e2b4712SSatish Balay       v += 25;
21124e2b4712SSatish Balay     }
21134e2b4712SSatish Balay     idc = 5*(*c--);
21144e2b4712SSatish Balay     v   = aa + 25*diag[i];
2115f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2116f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2117f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2118f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2119f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2120f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2121f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2122f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2123f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2124f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
21254e2b4712SSatish Balay   }
21264e2b4712SSatish Balay 
21274e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21284e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2129d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2131dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
21324e2b4712SSatish Balay   PetscFunctionReturn(0);
21334e2b4712SSatish Balay }
21344e2b4712SSatish Balay 
213578bb4007SShri Abhyankar #undef __FUNCT__
2136a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2137a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
213878bb4007SShri Abhyankar {
213978bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
214078bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
214178bb4007SShri Abhyankar   PetscErrorCode    ierr;
214278bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
214378bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
214478bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
214578bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
214678bb4007SShri Abhyankar   const PetscScalar *b;
214778bb4007SShri Abhyankar 
214878bb4007SShri Abhyankar   PetscFunctionBegin;
214978bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
215078bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
215178bb4007SShri Abhyankar   t  = a->solve_work;
215278bb4007SShri Abhyankar 
215378bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
215478bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
215578bb4007SShri Abhyankar 
215678bb4007SShri Abhyankar   /* forward solve the lower triangular */
215778bb4007SShri Abhyankar   idx    = 5*r[0];
215878bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
215978bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
216078bb4007SShri Abhyankar   for (i=1; i<n; i++) {
216178bb4007SShri Abhyankar     v     = aa + 25*ai[i];
216278bb4007SShri Abhyankar     vi    = aj + ai[i];
216378bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
216478bb4007SShri Abhyankar     idx   = 5*r[i];
216578bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
216678bb4007SShri Abhyankar     s5  = b[4+idx];
216778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
216878bb4007SShri Abhyankar       idx   = 5*vi[m];
216978bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
217078bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
217178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
217278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
217378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
217478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
217578bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
217678bb4007SShri Abhyankar       v += 25;
217778bb4007SShri Abhyankar     }
217878bb4007SShri Abhyankar     idx = 5*i;
217978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
218078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
218178bb4007SShri Abhyankar   }
218278bb4007SShri Abhyankar   /* backward solve the upper triangular */
218378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
218478bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
218578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
218678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
218778bb4007SShri Abhyankar     idt  = 5*i;
218878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
218978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
219078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
219178bb4007SShri Abhyankar       idx   = 5*vi[m];
219278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
219378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
219478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
219578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
219678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
219778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
219878bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
219978bb4007SShri Abhyankar       v += 25;
220078bb4007SShri Abhyankar     }
220178bb4007SShri Abhyankar     idc = 5*c[i];
220278bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
220378bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
220478bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
220578bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
220678bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
220778bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
220878bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
220978bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
221078bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
221178bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
221278bb4007SShri Abhyankar   }
221378bb4007SShri Abhyankar 
221478bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
221578bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
221678bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
221778bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
221878bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
221978bb4007SShri Abhyankar   PetscFunctionReturn(0);
222078bb4007SShri Abhyankar }
222178bb4007SShri Abhyankar 
22228f690400SShri Abhyankar #undef __FUNCT__
22234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2224dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
222515091d37SBarry Smith {
222615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2227690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2228dfbe8321SBarry Smith   PetscErrorCode    ierr;
2229690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2230d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2231d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2232d9fead3dSBarry Smith   const PetscScalar *b;
223315091d37SBarry Smith 
223415091d37SBarry Smith   PetscFunctionBegin;
2235d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22361ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
223715091d37SBarry Smith   /* forward solve the lower triangular */
223815091d37SBarry Smith   idx    = 0;
223915091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
224015091d37SBarry Smith   for (i=1; i<n; i++) {
224115091d37SBarry Smith     v     =  aa + 25*ai[i];
224215091d37SBarry Smith     vi    =  aj + ai[i];
224315091d37SBarry Smith     nz    =  diag[i] - ai[i];
224415091d37SBarry Smith     idx   =  5*i;
2245f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
224615091d37SBarry Smith     while (nz--) {
224715091d37SBarry Smith       jdx   = 5*(*vi++);
224815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2249f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2250f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2251f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2252f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2253f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
225415091d37SBarry Smith       v    += 25;
225515091d37SBarry Smith     }
2256f1af5d2fSBarry Smith     x[idx]   = s1;
2257f1af5d2fSBarry Smith     x[1+idx] = s2;
2258f1af5d2fSBarry Smith     x[2+idx] = s3;
2259f1af5d2fSBarry Smith     x[3+idx] = s4;
2260f1af5d2fSBarry Smith     x[4+idx] = s5;
226115091d37SBarry Smith   }
226215091d37SBarry Smith   /* backward solve the upper triangular */
226315091d37SBarry Smith   for (i=n-1; i>=0; i--){
226415091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
226515091d37SBarry Smith     vi   = aj + diag[i] + 1;
226615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
226715091d37SBarry Smith     idt  = 5*i;
2268f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2269f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
227015091d37SBarry Smith     while (nz--) {
227115091d37SBarry Smith       idx   = 5*(*vi++);
227215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2273f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2274f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2275f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2276f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2277f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
227815091d37SBarry Smith       v    += 25;
227915091d37SBarry Smith     }
228015091d37SBarry Smith     v        = aa + 25*diag[i];
2281f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2282f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2283f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2284f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2285f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
228615091d37SBarry Smith   }
228715091d37SBarry Smith 
2288d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2290dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
229115091d37SBarry Smith   PetscFunctionReturn(0);
229215091d37SBarry Smith }
229315091d37SBarry Smith 
2294cee9d6f2SShri Abhyankar #undef __FUNCT__
2295a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2296a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
229753cca76cSShri Abhyankar {
229853cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
229953cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
230053cca76cSShri Abhyankar   PetscErrorCode    ierr;
230153cca76cSShri Abhyankar   PetscInt          jdx;
230253cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
230353cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
230453cca76cSShri Abhyankar   const PetscScalar *b;
230553cca76cSShri Abhyankar 
230653cca76cSShri Abhyankar   PetscFunctionBegin;
230753cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
230853cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
230953cca76cSShri Abhyankar   /* forward solve the lower triangular */
231053cca76cSShri Abhyankar   idx    = 0;
231153cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
231253cca76cSShri Abhyankar   for (i=1; i<n; i++) {
231353cca76cSShri Abhyankar     v   = aa + 25*ai[i];
231453cca76cSShri Abhyankar     vi  = aj + ai[i];
231553cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
231653cca76cSShri Abhyankar     idx = 5*i;
231753cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
231853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
231953cca76cSShri Abhyankar       jdx   = 5*vi[k];
232053cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
232153cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
232253cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
232353cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
232453cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
232553cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
232653cca76cSShri Abhyankar       v    += 25;
232753cca76cSShri Abhyankar     }
232853cca76cSShri Abhyankar     x[idx]   = s1;
232953cca76cSShri Abhyankar     x[1+idx] = s2;
233053cca76cSShri Abhyankar     x[2+idx] = s3;
233153cca76cSShri Abhyankar     x[3+idx] = s4;
233253cca76cSShri Abhyankar     x[4+idx] = s5;
233353cca76cSShri Abhyankar   }
233453cca76cSShri Abhyankar 
233553cca76cSShri Abhyankar   /* backward solve the upper triangular */
233653cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
233753cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
233853cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
233953cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
234053cca76cSShri Abhyankar     idt = 5*i;
234153cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
234253cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
234353cca76cSShri Abhyankar     for(k=0;k<nz;k++){
234453cca76cSShri Abhyankar       idx   = 5*vi[k];
234553cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
234653cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
234753cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
234853cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
234953cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
235053cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
235153cca76cSShri Abhyankar       v    += 25;
235253cca76cSShri Abhyankar     }
235353cca76cSShri Abhyankar     /* x = inv_diagonal*x */
235453cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
235553cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
235653cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
235753cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
235853cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
235953cca76cSShri Abhyankar   }
236053cca76cSShri Abhyankar 
236153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
236253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
236353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
236453cca76cSShri Abhyankar   PetscFunctionReturn(0);
236553cca76cSShri Abhyankar }
236653cca76cSShri Abhyankar 
236753cca76cSShri Abhyankar #undef __FUNCT__
23684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2369dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
23704e2b4712SSatish Balay {
23714e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
23724e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
23736849ba73SBarry Smith   PetscErrorCode    ierr;
23745d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
23755d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2376d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2377d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2378d9fead3dSBarry Smith   const PetscScalar *b;
23794e2b4712SSatish Balay 
23804e2b4712SSatish Balay   PetscFunctionBegin;
2381d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2383f1af5d2fSBarry Smith   t  = a->solve_work;
23844e2b4712SSatish Balay 
23854e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
23864e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
23874e2b4712SSatish Balay 
23884e2b4712SSatish Balay   /* forward solve the lower triangular */
23894e2b4712SSatish Balay   idx    = 4*(*r++);
2390f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2391f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
23924e2b4712SSatish Balay   for (i=1; i<n; i++) {
23934e2b4712SSatish Balay     v     = aa + 16*ai[i];
23944e2b4712SSatish Balay     vi    = aj + ai[i];
23954e2b4712SSatish Balay     nz    = diag[i] - ai[i];
23964e2b4712SSatish Balay     idx   = 4*(*r++);
2397f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
23984e2b4712SSatish Balay     while (nz--) {
23994e2b4712SSatish Balay       idx   = 4*(*vi++);
2400f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2401f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2402f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2403f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2404f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
24054e2b4712SSatish Balay       v    += 16;
24064e2b4712SSatish Balay     }
24074e2b4712SSatish Balay     idx        = 4*i;
2408f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2409f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
24104e2b4712SSatish Balay   }
24114e2b4712SSatish Balay   /* backward solve the upper triangular */
24124e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24134e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
24144e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24154e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
24164e2b4712SSatish Balay     idt  = 4*i;
2417f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2418f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
24194e2b4712SSatish Balay     while (nz--) {
24204e2b4712SSatish Balay       idx   = 4*(*vi++);
2421f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2422f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2423f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2424f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2425f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2426f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
24274e2b4712SSatish Balay       v += 16;
24284e2b4712SSatish Balay     }
24294e2b4712SSatish Balay     idc      = 4*(*c--);
24304e2b4712SSatish Balay     v        = aa + 16*diag[i];
2431f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2432f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2433f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2434f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
24354e2b4712SSatish Balay   }
24364e2b4712SSatish Balay 
24374e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
24384e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2439d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2441dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
24424e2b4712SSatish Balay   PetscFunctionReturn(0);
24434e2b4712SSatish Balay }
2444f26ec98cSKris Buschelman 
24458f690400SShri Abhyankar #undef __FUNCT__
2446a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2447a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
244878bb4007SShri Abhyankar {
244978bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
245078bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
245178bb4007SShri Abhyankar   PetscErrorCode    ierr;
245278bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
245378bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
245478bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
245578bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
245678bb4007SShri Abhyankar   const PetscScalar *b;
245778bb4007SShri Abhyankar 
245878bb4007SShri Abhyankar   PetscFunctionBegin;
245978bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
246078bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
246178bb4007SShri Abhyankar   t  = a->solve_work;
246278bb4007SShri Abhyankar 
246378bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
246478bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
246578bb4007SShri Abhyankar 
246678bb4007SShri Abhyankar   /* forward solve the lower triangular */
246778bb4007SShri Abhyankar   idx    = 4*r[0];
246878bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
246978bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
247078bb4007SShri Abhyankar   for (i=1; i<n; i++) {
247178bb4007SShri Abhyankar     v     = aa + 16*ai[i];
247278bb4007SShri Abhyankar     vi    = aj + ai[i];
247378bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
247478bb4007SShri Abhyankar     idx   = 4*r[i];
247578bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
247678bb4007SShri Abhyankar     for(m=0;m<nz;m++){
247778bb4007SShri Abhyankar       idx   = 4*vi[m];
247878bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
247978bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
248078bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
248178bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
248278bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
248378bb4007SShri Abhyankar       v    += 16;
248478bb4007SShri Abhyankar     }
248578bb4007SShri Abhyankar     idx        = 4*i;
248678bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
248778bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
248878bb4007SShri Abhyankar   }
248978bb4007SShri Abhyankar   /* backward solve the upper triangular */
249078bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
249178bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
249278bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
249378bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
249478bb4007SShri Abhyankar     idt  = 4*i;
249578bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
249678bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
249778bb4007SShri Abhyankar     for(m=0;m<nz;m++){
249878bb4007SShri Abhyankar       idx   = 4*vi[m];
249978bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
250078bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
250178bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
250278bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
250378bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
250478bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
250578bb4007SShri Abhyankar       v += 16;
250678bb4007SShri Abhyankar     }
250778bb4007SShri Abhyankar     idc      = 4*c[i];
250878bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
250978bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
251078bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
251178bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
251278bb4007SShri Abhyankar   }
251378bb4007SShri Abhyankar 
251478bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
251578bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
251678bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
251778bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
251878bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
251978bb4007SShri Abhyankar   PetscFunctionReturn(0);
252078bb4007SShri Abhyankar }
252178bb4007SShri Abhyankar 
252278bb4007SShri Abhyankar #undef __FUNCT__
2523f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2524dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2525f26ec98cSKris Buschelman {
2526f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2527f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
25286849ba73SBarry Smith   PetscErrorCode    ierr;
25295d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
25305d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2531d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2532d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2533d9fead3dSBarry Smith   PetscScalar       *x;
2534d9fead3dSBarry Smith   const PetscScalar *b;
2535f26ec98cSKris Buschelman 
2536f26ec98cSKris Buschelman   PetscFunctionBegin;
2537d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25381ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2539f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2540f26ec98cSKris Buschelman 
2541f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2542f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2543f26ec98cSKris Buschelman 
2544f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2545f26ec98cSKris Buschelman   idx    = 4*(*r++);
2546f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2547f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2548f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2549f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2550f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2551f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2552f26ec98cSKris Buschelman     vi    = aj + ai[i];
2553f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2554f26ec98cSKris Buschelman     idx   = 4*(*r++);
2555f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2556f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2557f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2558f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2559f26ec98cSKris Buschelman     while (nz--) {
2560f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2561f26ec98cSKris Buschelman       x1  = t[idx];
2562f26ec98cSKris Buschelman       x2  = t[1+idx];
2563f26ec98cSKris Buschelman       x3  = t[2+idx];
2564f26ec98cSKris Buschelman       x4  = t[3+idx];
2565f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2566f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2567f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2568f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2569f26ec98cSKris Buschelman       v    += 16;
2570f26ec98cSKris Buschelman     }
2571f26ec98cSKris Buschelman     idx        = 4*i;
2572f26ec98cSKris Buschelman     t[idx]   = s1;
2573f26ec98cSKris Buschelman     t[1+idx] = s2;
2574f26ec98cSKris Buschelman     t[2+idx] = s3;
2575f26ec98cSKris Buschelman     t[3+idx] = s4;
2576f26ec98cSKris Buschelman   }
2577f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2578f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2579f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2580f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2581f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2582f26ec98cSKris Buschelman     idt  = 4*i;
2583f26ec98cSKris Buschelman     s1 = t[idt];
2584f26ec98cSKris Buschelman     s2 = t[1+idt];
2585f26ec98cSKris Buschelman     s3 = t[2+idt];
2586f26ec98cSKris Buschelman     s4 = t[3+idt];
2587f26ec98cSKris Buschelman     while (nz--) {
2588f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2589f26ec98cSKris Buschelman       x1  = t[idx];
2590f26ec98cSKris Buschelman       x2  = t[1+idx];
2591f26ec98cSKris Buschelman       x3  = t[2+idx];
2592f26ec98cSKris Buschelman       x4  = t[3+idx];
2593f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2594f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2595f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2596f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2597f26ec98cSKris Buschelman       v += 16;
2598f26ec98cSKris Buschelman     }
2599f26ec98cSKris Buschelman     idc      = 4*(*c--);
2600f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2601f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2602f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2603f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2604f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2605f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2606f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2607f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2608f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2609f26ec98cSKris Buschelman  }
2610f26ec98cSKris Buschelman 
2611f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2612f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2613d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2615dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2616f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2617f26ec98cSKris Buschelman }
2618f26ec98cSKris Buschelman 
261924c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
262024c233c2SKris Buschelman 
262124c233c2SKris Buschelman #include PETSC_HAVE_SSE
262224c233c2SKris Buschelman 
262324c233c2SKris Buschelman #undef __FUNCT__
262424c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2625dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
262624c233c2SKris Buschelman {
262724c233c2SKris Buschelman   /*
262824c233c2SKris Buschelman      Note: This code uses demotion of double
262924c233c2SKris Buschelman      to float when performing the mixed-mode computation.
263024c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
263124c233c2SKris Buschelman   */
263224c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
263324c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
26346849ba73SBarry Smith   PetscErrorCode ierr;
26355d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
26365d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
263724c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
263887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
263924c233c2SKris Buschelman 
264024c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
264124c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
264224c233c2SKris Buschelman   unsigned long   offset;
264324c233c2SKris Buschelman 
264424c233c2SKris Buschelman   PetscFunctionBegin;
264524c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
264624c233c2SKris Buschelman 
264724c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
264824c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
264924c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
265024c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
265124c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
265224c233c2SKris Buschelman 
26531ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26541ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265524c233c2SKris Buschelman     t  = a->solve_work;
265624c233c2SKris Buschelman 
265724c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
265824c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
265924c233c2SKris Buschelman 
266024c233c2SKris Buschelman     /* forward solve the lower triangular */
266124c233c2SKris Buschelman     idx  = 4*(*r++);
266224c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
266324c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
266424c233c2SKris Buschelman     v    =  aa + 16*ai[1];
266524c233c2SKris Buschelman 
266624c233c2SKris Buschelman     for (i=1; i<n;) {
266724c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
266824c233c2SKris Buschelman       vi   =  aj      + ai[i];
266924c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
267024c233c2SKris Buschelman       idx  =  4*(*r++);
267124c233c2SKris Buschelman 
267224c233c2SKris Buschelman       /* Demote sum from double to float */
267324c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
267424c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
267524c233c2SKris Buschelman 
267624c233c2SKris Buschelman       while (nz--) {
267724c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
267824c233c2SKris Buschelman         idx = 4*(*vi++);
267924c233c2SKris Buschelman 
268024c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
268124c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
268224c233c2SKris Buschelman 
268324c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
268424c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
268524c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
268624c233c2SKris Buschelman 
268724c233c2SKris Buschelman           /* First Column */
268824c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
268924c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
269024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
269124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
269224c233c2SKris Buschelman 
269324c233c2SKris Buschelman           /* Second Column */
269424c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
269524c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
269624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
269724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
269824c233c2SKris Buschelman 
269924c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
270024c233c2SKris Buschelman 
270124c233c2SKris Buschelman           /* Third Column */
270224c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
270324c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
270424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
270524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
270624c233c2SKris Buschelman 
270724c233c2SKris Buschelman           /* Fourth Column */
270824c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
270924c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
271024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
271124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
271224c233c2SKris Buschelman         SSE_INLINE_END_2
271324c233c2SKris Buschelman 
271424c233c2SKris Buschelman         v  += 16;
271524c233c2SKris Buschelman       }
271624c233c2SKris Buschelman       idx = 4*i;
271724c233c2SKris Buschelman       v   = aa + 16*ai[++i];
271824c233c2SKris Buschelman       PREFETCH_NTA(v);
271924c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
272024c233c2SKris Buschelman 
272124c233c2SKris Buschelman       /* Promote result from float to double */
272224c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
272324c233c2SKris Buschelman     }
272424c233c2SKris Buschelman     /* backward solve the upper triangular */
272524c233c2SKris Buschelman     idt  = 4*(n-1);
272624c233c2SKris Buschelman     ai16 = 16*diag[n-1];
272724c233c2SKris Buschelman     v    = aa + ai16 + 16;
272824c233c2SKris Buschelman     for (i=n-1; i>=0;){
272924c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
273024c233c2SKris Buschelman       vi = aj + diag[i] + 1;
273124c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
273224c233c2SKris Buschelman 
273324c233c2SKris Buschelman       /* Demote accumulator from double to float */
273424c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
273524c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
273624c233c2SKris Buschelman 
273724c233c2SKris Buschelman       while (nz--) {
273824c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
273924c233c2SKris Buschelman         idx = 4*(*vi++);
274024c233c2SKris Buschelman 
274124c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
274224c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
274324c233c2SKris Buschelman 
274424c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
274524c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
274624c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
274724c233c2SKris Buschelman 
274824c233c2SKris Buschelman           /* First Column */
274924c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
275024c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
275124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
275224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
275324c233c2SKris Buschelman 
275424c233c2SKris Buschelman           /* Second Column */
275524c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
275624c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
275724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
275824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
275924c233c2SKris Buschelman 
276024c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
276124c233c2SKris Buschelman 
276224c233c2SKris Buschelman           /* Third Column */
276324c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
276424c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
276524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
276624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
276724c233c2SKris Buschelman 
276824c233c2SKris Buschelman           /* Fourth Column */
276924c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
277024c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
277124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
277224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
277324c233c2SKris Buschelman         SSE_INLINE_END_2
277424c233c2SKris Buschelman         v  += 16;
277524c233c2SKris Buschelman       }
277624c233c2SKris Buschelman       v    = aa + ai16;
277724c233c2SKris Buschelman       ai16 = 16*diag[--i];
277824c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
277924c233c2SKris Buschelman       /*
278024c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
278124c233c2SKris Buschelman          which was inverted as part of the factorization
278224c233c2SKris Buschelman       */
278324c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
278424c233c2SKris Buschelman         /* First Column */
278524c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
278624c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
278724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
278824c233c2SKris Buschelman 
278924c233c2SKris Buschelman         /* Second Column */
279024c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
279124c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
279224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
279324c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
279424c233c2SKris Buschelman 
279524c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
279624c233c2SKris Buschelman 
279724c233c2SKris Buschelman         /* Third Column */
279824c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
279924c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
280024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
280124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
280224c233c2SKris Buschelman 
280324c233c2SKris Buschelman         /* Fourth Column */
280424c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
280524c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
280624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
280724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
280824c233c2SKris Buschelman 
280924c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
281024c233c2SKris Buschelman       SSE_INLINE_END_3
281124c233c2SKris Buschelman 
281224c233c2SKris Buschelman       /* Promote solution from float to double */
281324c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
281424c233c2SKris Buschelman 
281524c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
281624c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
281724c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
281824c233c2SKris Buschelman       idc  = 4*(*c--);
281924c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
282024c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
282124c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
282224c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
282324c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
282424c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
282524c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
282624c233c2SKris Buschelman       SSE_INLINE_END_2
282724c233c2SKris Buschelman       v    = aa + ai16 + 16;
282824c233c2SKris Buschelman       idt -= 4;
282924c233c2SKris Buschelman     }
283024c233c2SKris Buschelman 
283124c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
283224c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
28331ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28341ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2835dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
283624c233c2SKris Buschelman   SSE_SCOPE_END;
283724c233c2SKris Buschelman   PetscFunctionReturn(0);
283824c233c2SKris Buschelman }
283924c233c2SKris Buschelman 
284024c233c2SKris Buschelman #endif
28410ef38995SBarry Smith 
28420ef38995SBarry Smith 
28434e2b4712SSatish Balay /*
28444e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
28454e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
28464e2b4712SSatish Balay */
28474a2ae208SSatish Balay #undef __FUNCT__
28484a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2849dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
28504e2b4712SSatish Balay {
28514e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2852356650c2SBarry Smith   PetscInt          n=a->mbs;
2853356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2854dfbe8321SBarry Smith   PetscErrorCode    ierr;
2855356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2856d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2857d9fead3dSBarry Smith   PetscScalar       *x;
2858d9fead3dSBarry Smith   const PetscScalar *b;
28594e2b4712SSatish Balay 
28604e2b4712SSatish Balay   PetscFunctionBegin;
2861d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28634e2b4712SSatish Balay 
2864aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
28652853dc0eSBarry Smith   {
286687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
28672853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
28682853dc0eSBarry Smith   }
2869aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
28702853dc0eSBarry Smith   {
287187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
28722853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
28732853dc0eSBarry Smith   }
2874aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
28752853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2876e1293385SBarry Smith #else
287730d4dcafSBarry Smith   {
287887828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2879d9fead3dSBarry Smith     const MatScalar *v;
2880356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2881356650c2SBarry Smith     const PetscInt  *vi;
2882e1293385SBarry Smith 
28834e2b4712SSatish Balay   /* forward solve the lower triangular */
28844e2b4712SSatish Balay   idx    = 0;
2885e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
28864e2b4712SSatish Balay   for (i=1; i<n; i++) {
28874e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
28884e2b4712SSatish Balay     vi    =  aj      + ai[i];
28894e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2890e1293385SBarry Smith     idx   +=  4;
2891f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
28924e2b4712SSatish Balay     while (nz--) {
28934e2b4712SSatish Balay       jdx   = 4*(*vi++);
28944e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2895f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2896f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2897f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2898f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
28994e2b4712SSatish Balay       v    += 16;
29004e2b4712SSatish Balay     }
2901f1af5d2fSBarry Smith     x[idx]   = s1;
2902f1af5d2fSBarry Smith     x[1+idx] = s2;
2903f1af5d2fSBarry Smith     x[2+idx] = s3;
2904f1af5d2fSBarry Smith     x[3+idx] = s4;
29054e2b4712SSatish Balay   }
29064e2b4712SSatish Balay   /* backward solve the upper triangular */
29074e555682SBarry Smith   idt = 4*(n-1);
29084e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29094e555682SBarry Smith     ai16 = 16*diag[i];
29104e555682SBarry Smith     v    = aa + ai16 + 16;
29114e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29124e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2913f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2914f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
29154e2b4712SSatish Balay     while (nz--) {
29164e2b4712SSatish Balay       idx   = 4*(*vi++);
29174e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2918f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2919f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2920f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2921f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
29224e2b4712SSatish Balay       v    += 16;
29234e2b4712SSatish Balay     }
29244e555682SBarry Smith     v        = aa + ai16;
2925f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2926f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2927f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2928f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2929329f5518SBarry Smith     idt -= 4;
29304e2b4712SSatish Balay   }
293130d4dcafSBarry Smith   }
2932e1293385SBarry Smith #endif
29334e2b4712SSatish Balay 
2934d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29351ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2936dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
29374e2b4712SSatish Balay   PetscFunctionReturn(0);
29384e2b4712SSatish Balay }
29394e2b4712SSatish Balay 
2940b2b2dd24SShri Abhyankar #undef __FUNCT__
2941a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2942a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2943b2b2dd24SShri Abhyankar {
2944b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2945b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
2946b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
2947b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
2948b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2949b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2950b2b2dd24SShri Abhyankar     PetscScalar       *x;
2951b2b2dd24SShri Abhyankar     const PetscScalar *b;
2952b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2953cee9d6f2SShri Abhyankar 
2954b2b2dd24SShri Abhyankar     PetscFunctionBegin;
2955b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2956b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2957b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
2958b2b2dd24SShri Abhyankar     idx    = 0;
2959b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2960b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
2961b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
2962b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
2963b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
2964b2b2dd24SShri Abhyankar       idx   = bs*i;
2965b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2966b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
2967b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
2968b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2969b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2970b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2971b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2972b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2973b2b2dd24SShri Abhyankar 
2974b2b2dd24SShri Abhyankar           v   +=  bs2;
2975b2b2dd24SShri Abhyankar         }
2976b2b2dd24SShri Abhyankar 
2977b2b2dd24SShri Abhyankar        x[idx]   = s1;
2978b2b2dd24SShri Abhyankar        x[1+idx] = s2;
2979b2b2dd24SShri Abhyankar        x[2+idx] = s3;
2980b2b2dd24SShri Abhyankar        x[3+idx] = s4;
2981b2b2dd24SShri Abhyankar     }
2982b2b2dd24SShri Abhyankar 
2983b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
2984b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
2985b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
2986b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
2987b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
2988b2b2dd24SShri Abhyankar      idt = bs*i;
2989b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2990b2b2dd24SShri Abhyankar 
2991b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
2992b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
2993b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2994b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2995b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2996b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2997b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2998b2b2dd24SShri Abhyankar 
2999b2b2dd24SShri Abhyankar         v   +=  bs2;
3000b2b2dd24SShri Abhyankar     }
3001b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3002b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3003b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3004b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3005b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3006b2b2dd24SShri Abhyankar 
3007b2b2dd24SShri Abhyankar   }
3008b2b2dd24SShri Abhyankar 
3009b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3010b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3011b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3012b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3013b2b2dd24SShri Abhyankar }
3014cee9d6f2SShri Abhyankar 
3015cee9d6f2SShri Abhyankar #undef __FUNCT__
3016f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3017dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3018f26ec98cSKris Buschelman {
3019f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3020690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3021dfbe8321SBarry Smith   PetscErrorCode ierr;
3022690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3023f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3024f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3025f26ec98cSKris Buschelman 
3026f26ec98cSKris Buschelman   PetscFunctionBegin;
30271ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30281ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3029f26ec98cSKris Buschelman 
3030f26ec98cSKris Buschelman   {
3031f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3032f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3033690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3034f26ec98cSKris Buschelman 
3035f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3036f26ec98cSKris Buschelman     idx  = 0;
3037f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3038f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3039f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3040f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3041f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3042f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3043f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3044f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3045f26ec98cSKris Buschelman       idx   +=  4;
3046f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3047f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3048f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3049f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3050f26ec98cSKris Buschelman       while (nz--) {
3051f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3052f26ec98cSKris Buschelman         x1  = t[jdx];
3053f26ec98cSKris Buschelman         x2  = t[1+jdx];
3054f26ec98cSKris Buschelman         x3  = t[2+jdx];
3055f26ec98cSKris Buschelman         x4  = t[3+jdx];
3056f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3057f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3058f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3059f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3060f26ec98cSKris Buschelman         v    += 16;
3061f26ec98cSKris Buschelman       }
3062f26ec98cSKris Buschelman       t[idx]   = s1;
3063f26ec98cSKris Buschelman       t[1+idx] = s2;
3064f26ec98cSKris Buschelman       t[2+idx] = s3;
3065f26ec98cSKris Buschelman       t[3+idx] = s4;
3066f26ec98cSKris Buschelman     }
3067f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3068f26ec98cSKris Buschelman     idt = 4*(n-1);
3069f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3070f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3071f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3072f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3073f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3074f26ec98cSKris Buschelman       s1   = t[idt];
3075f26ec98cSKris Buschelman       s2   = t[1+idt];
3076f26ec98cSKris Buschelman       s3   = t[2+idt];
3077f26ec98cSKris Buschelman       s4   = t[3+idt];
3078f26ec98cSKris Buschelman       while (nz--) {
3079f26ec98cSKris Buschelman         idx = 4*(*vi++);
3080f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3081f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3082f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3083f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3084f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3085f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3086f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3087f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3088f26ec98cSKris Buschelman         v    += 16;
3089f26ec98cSKris Buschelman       }
3090f26ec98cSKris Buschelman       v        = aa + ai16;
3091f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3092f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3093f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3094f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3095f26ec98cSKris Buschelman       idt -= 4;
3096f26ec98cSKris Buschelman     }
3097f26ec98cSKris Buschelman   }
3098f26ec98cSKris Buschelman 
30991ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
31001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3101dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3102f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3103f26ec98cSKris Buschelman }
3104f26ec98cSKris Buschelman 
31053660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
31063660e330SKris Buschelman 
31073660e330SKris Buschelman #include PETSC_HAVE_SSE
31083660e330SKris Buschelman #undef __FUNCT__
31097cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3110dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
31113660e330SKris Buschelman {
31123660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
31132aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3114dfbe8321SBarry Smith   PetscErrorCode ierr;
3115dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
31163660e330SKris Buschelman   MatScalar      *aa=a->a;
311787828ca2SBarry Smith   PetscScalar    *x,*b;
31183660e330SKris Buschelman 
31193660e330SKris Buschelman   PetscFunctionBegin;
31203660e330SKris Buschelman   SSE_SCOPE_BEGIN;
31213660e330SKris Buschelman   /*
31223660e330SKris Buschelman      Note: This code currently uses demotion of double
31233660e330SKris Buschelman      to float when performing the mixed-mode computation.
31243660e330SKris Buschelman      This may not be numerically reasonable for all applications.
31253660e330SKris Buschelman   */
31263660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
31273660e330SKris Buschelman 
31281ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
31291ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
31303660e330SKris Buschelman   {
3131eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3132eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
31332aa5897fSKris Buschelman     int            nz,i,idt,ai16;
31342aa5897fSKris Buschelman     unsigned int   jdx,idx;
31352aa5897fSKris Buschelman     unsigned short *vi;
3136eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
31373660e330SKris Buschelman 
3138eb05f457SKris Buschelman     /* First block is the identity. */
31393660e330SKris Buschelman     idx  = 0;
3140eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
31412aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
31423660e330SKris Buschelman 
31433660e330SKris Buschelman     for (i=1; i<n;) {
31443660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
31453660e330SKris Buschelman       vi   =  aj      + ai[i];
31463660e330SKris Buschelman       nz   =  diag[i] - ai[i];
31473660e330SKris Buschelman       idx +=  4;
31483660e330SKris Buschelman 
3149eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3150eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3151eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
31523660e330SKris Buschelman 
31533660e330SKris Buschelman       while (nz--) {
31543660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
31552aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
31563660e330SKris Buschelman 
31573660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3158eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
31593660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
31603660e330SKris Buschelman 
31613660e330SKris Buschelman           /* First Column */
31623660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
31633660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
31643660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
31653660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
31663660e330SKris Buschelman 
31673660e330SKris Buschelman           /* Second Column */
31683660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
31693660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
31703660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
31713660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
31723660e330SKris Buschelman 
31733660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
31743660e330SKris Buschelman 
31753660e330SKris Buschelman           /* Third Column */
31763660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
31773660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
31783660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
31793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
31803660e330SKris Buschelman 
31813660e330SKris Buschelman           /* Fourth Column */
31823660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
31833660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
31843660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
31853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
31863660e330SKris Buschelman         SSE_INLINE_END_2
31873660e330SKris Buschelman 
31883660e330SKris Buschelman         v  += 16;
31893660e330SKris Buschelman       }
31903660e330SKris Buschelman       v    =  aa + 16*ai[++i];
31913660e330SKris Buschelman       PREFETCH_NTA(v);
3192eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
31933660e330SKris Buschelman     }
3194eb05f457SKris Buschelman 
3195eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3196eb05f457SKris Buschelman 
31973660e330SKris Buschelman     idt  = 4*(n-1);
31983660e330SKris Buschelman     ai16 = 16*diag[n-1];
31993660e330SKris Buschelman     v    = aa + ai16 + 16;
32003660e330SKris Buschelman     for (i=n-1; i>=0;){
32013660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
32023660e330SKris Buschelman       vi = aj + diag[i] + 1;
32033660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
32043660e330SKris Buschelman 
3205eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
32063660e330SKris Buschelman 
32073660e330SKris Buschelman       while (nz--) {
32083660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
32092aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
32103660e330SKris Buschelman 
32113660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3212eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
32133660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
32143660e330SKris Buschelman 
32153660e330SKris Buschelman           /* First Column */
32163660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
32173660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
32183660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
32193660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
32203660e330SKris Buschelman 
32213660e330SKris Buschelman           /* Second Column */
32223660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
32233660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
32243660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
32253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
32263660e330SKris Buschelman 
32273660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
32283660e330SKris Buschelman 
32293660e330SKris Buschelman           /* Third Column */
32303660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
32313660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
32323660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
32333660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
32343660e330SKris Buschelman 
32353660e330SKris Buschelman           /* Fourth Column */
32363660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
32373660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
32383660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
32393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
32403660e330SKris Buschelman         SSE_INLINE_END_2
32413660e330SKris Buschelman         v  += 16;
32423660e330SKris Buschelman       }
32433660e330SKris Buschelman       v    = aa + ai16;
32443660e330SKris Buschelman       ai16 = 16*diag[--i];
32453660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
32463660e330SKris Buschelman       /*
32473660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
32483660e330SKris Buschelman          which was inverted as part of the factorization
32493660e330SKris Buschelman       */
3250eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
32513660e330SKris Buschelman         /* First Column */
32523660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
32533660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
32543660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
32553660e330SKris Buschelman 
32563660e330SKris Buschelman         /* Second Column */
32573660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
32583660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
32593660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
32603660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
32613660e330SKris Buschelman 
32623660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
32633660e330SKris Buschelman 
32643660e330SKris Buschelman         /* Third Column */
32653660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
32663660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
32673660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
32683660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
32693660e330SKris Buschelman 
32703660e330SKris Buschelman         /* Fourth Column */
32713660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
32723660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
32733660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
32743660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
32753660e330SKris Buschelman 
32763660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
32773660e330SKris Buschelman       SSE_INLINE_END_3
32783660e330SKris Buschelman 
32793660e330SKris Buschelman       v    = aa + ai16 + 16;
32803660e330SKris Buschelman       idt -= 4;
32813660e330SKris Buschelman     }
3282eb05f457SKris Buschelman 
3283eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3284eb05f457SKris Buschelman     idt = 4*(n-1);
3285eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3286eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3287eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3288eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3289eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3290eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3291eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3292eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3293eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
329454693613SKris Buschelman       idt -= 4;
32953660e330SKris Buschelman     }
3296eb05f457SKris Buschelman 
3297eb05f457SKris Buschelman   } /* End of artificial scope. */
32981ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32991ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3300dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
33013660e330SKris Buschelman   SSE_SCOPE_END;
33023660e330SKris Buschelman   PetscFunctionReturn(0);
33033660e330SKris Buschelman }
33043660e330SKris Buschelman 
33057cf1b8d3SKris Buschelman #undef __FUNCT__
33067cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3307dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
33087cf1b8d3SKris Buschelman {
33097cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
33107cf1b8d3SKris Buschelman   int            *aj=a->j;
3311dfbe8321SBarry Smith   PetscErrorCode ierr;
3312dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
33137cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
33147cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
33157cf1b8d3SKris Buschelman 
33167cf1b8d3SKris Buschelman   PetscFunctionBegin;
33177cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
33187cf1b8d3SKris Buschelman   /*
33197cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
33207cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
33217cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
33227cf1b8d3SKris Buschelman   */
33237cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
33247cf1b8d3SKris Buschelman 
33251ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
33261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
33277cf1b8d3SKris Buschelman   {
33287cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
33297cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
33307cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
33317cf1b8d3SKris Buschelman     int       jdx,idx;
33327cf1b8d3SKris Buschelman     int       *vi;
33337cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
33347cf1b8d3SKris Buschelman 
33357cf1b8d3SKris Buschelman     /* First block is the identity. */
33367cf1b8d3SKris Buschelman     idx  = 0;
33377cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
33387cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
33397cf1b8d3SKris Buschelman 
33407cf1b8d3SKris Buschelman     for (i=1; i<n;) {
33417cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
33427cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
33437cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
33447cf1b8d3SKris Buschelman       idx +=  4;
33457cf1b8d3SKris Buschelman 
33467cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
33477cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
33487cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
33497cf1b8d3SKris Buschelman 
33507cf1b8d3SKris Buschelman       while (nz--) {
33517cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
33527cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
33537cf1b8d3SKris Buschelman /*          jdx = *vi++; */
33547cf1b8d3SKris Buschelman 
33557cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
33567cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
33577cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
33587cf1b8d3SKris Buschelman 
33597cf1b8d3SKris Buschelman           /* First Column */
33607cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
33617cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
33627cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
33637cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
33647cf1b8d3SKris Buschelman 
33657cf1b8d3SKris Buschelman           /* Second Column */
33667cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
33677cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
33687cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
33697cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
33707cf1b8d3SKris Buschelman 
33717cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
33727cf1b8d3SKris Buschelman 
33737cf1b8d3SKris Buschelman           /* Third Column */
33747cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
33757cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
33767cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
33777cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
33787cf1b8d3SKris Buschelman 
33797cf1b8d3SKris Buschelman           /* Fourth Column */
33807cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
33817cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
33827cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
33837cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
33847cf1b8d3SKris Buschelman         SSE_INLINE_END_2
33857cf1b8d3SKris Buschelman 
33867cf1b8d3SKris Buschelman         v  += 16;
33877cf1b8d3SKris Buschelman       }
33887cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
33897cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
33907cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
33917cf1b8d3SKris Buschelman     }
33927cf1b8d3SKris Buschelman 
33937cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
33947cf1b8d3SKris Buschelman 
33957cf1b8d3SKris Buschelman     idt  = 4*(n-1);
33967cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
33977cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
33987cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
33997cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
34007cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
34017cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
34027cf1b8d3SKris Buschelman 
34037cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
34047cf1b8d3SKris Buschelman 
34057cf1b8d3SKris Buschelman       while (nz--) {
34067cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
34077cf1b8d3SKris Buschelman         idx = 4*(*vi++);
34087cf1b8d3SKris Buschelman /*          idx = *vi++; */
34097cf1b8d3SKris Buschelman 
34107cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
34117cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
34127cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
34137cf1b8d3SKris Buschelman 
34147cf1b8d3SKris Buschelman           /* First Column */
34157cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
34167cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
34177cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
34187cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
34197cf1b8d3SKris Buschelman 
34207cf1b8d3SKris Buschelman           /* Second Column */
34217cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
34227cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
34237cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
34247cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
34257cf1b8d3SKris Buschelman 
34267cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
34277cf1b8d3SKris Buschelman 
34287cf1b8d3SKris Buschelman           /* Third Column */
34297cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
34307cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
34317cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
34327cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
34337cf1b8d3SKris Buschelman 
34347cf1b8d3SKris Buschelman           /* Fourth Column */
34357cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
34367cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
34377cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
34387cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
34397cf1b8d3SKris Buschelman         SSE_INLINE_END_2
34407cf1b8d3SKris Buschelman         v  += 16;
34417cf1b8d3SKris Buschelman       }
34427cf1b8d3SKris Buschelman       v    = aa + ai16;
34437cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
34447cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
34457cf1b8d3SKris Buschelman       /*
34467cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
34477cf1b8d3SKris Buschelman          which was inverted as part of the factorization
34487cf1b8d3SKris Buschelman       */
34497cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
34507cf1b8d3SKris Buschelman         /* First Column */
34517cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
34527cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
34537cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
34547cf1b8d3SKris Buschelman 
34557cf1b8d3SKris Buschelman         /* Second Column */
34567cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
34577cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
34587cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
34597cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
34607cf1b8d3SKris Buschelman 
34617cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
34627cf1b8d3SKris Buschelman 
34637cf1b8d3SKris Buschelman         /* Third Column */
34647cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
34657cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
34667cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
34677cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
34687cf1b8d3SKris Buschelman 
34697cf1b8d3SKris Buschelman         /* Fourth Column */
34707cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
34717cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
34727cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
34737cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
34747cf1b8d3SKris Buschelman 
34757cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
34767cf1b8d3SKris Buschelman       SSE_INLINE_END_3
34777cf1b8d3SKris Buschelman 
34787cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
34797cf1b8d3SKris Buschelman       idt -= 4;
34807cf1b8d3SKris Buschelman     }
34817cf1b8d3SKris Buschelman 
34827cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
34837cf1b8d3SKris Buschelman     idt = 4*(n-1);
34847cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
34857cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
34867cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
34877cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
34887cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
34897cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
34907cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
34917cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
34927cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
34937cf1b8d3SKris Buschelman       idt -= 4;
34947cf1b8d3SKris Buschelman     }
34957cf1b8d3SKris Buschelman 
34967cf1b8d3SKris Buschelman   } /* End of artificial scope. */
34971ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3499dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
35007cf1b8d3SKris Buschelman   SSE_SCOPE_END;
35017cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
35027cf1b8d3SKris Buschelman }
35037cf1b8d3SKris Buschelman 
35043660e330SKris Buschelman #endif
35058f690400SShri Abhyankar 
35064a2ae208SSatish Balay #undef __FUNCT__
35074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3508dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
35094e2b4712SSatish Balay {
35104e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
35114e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
35126849ba73SBarry Smith   PetscErrorCode    ierr;
35135d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
35145d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3515d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3516d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3517d9fead3dSBarry Smith   const PetscScalar *b;
35184e2b4712SSatish Balay 
35194e2b4712SSatish Balay   PetscFunctionBegin;
3520d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3522f1af5d2fSBarry Smith   t  = a->solve_work;
35234e2b4712SSatish Balay 
35244e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
35254e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
35264e2b4712SSatish Balay 
35274e2b4712SSatish Balay   /* forward solve the lower triangular */
35284e2b4712SSatish Balay   idx    = 3*(*r++);
3529f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
35304e2b4712SSatish Balay   for (i=1; i<n; i++) {
35314e2b4712SSatish Balay     v     = aa + 9*ai[i];
35324e2b4712SSatish Balay     vi    = aj + ai[i];
35334e2b4712SSatish Balay     nz    = diag[i] - ai[i];
35344e2b4712SSatish Balay     idx   = 3*(*r++);
3535f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
35364e2b4712SSatish Balay     while (nz--) {
35374e2b4712SSatish Balay       idx   = 3*(*vi++);
3538f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3539f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3540f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3541f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
35424e2b4712SSatish Balay       v += 9;
35434e2b4712SSatish Balay     }
35444e2b4712SSatish Balay     idx = 3*i;
3545f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
35464e2b4712SSatish Balay   }
35474e2b4712SSatish Balay   /* backward solve the upper triangular */
35484e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
35494e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
35504e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
35514e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
35524e2b4712SSatish Balay     idt  = 3*i;
3553f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
35544e2b4712SSatish Balay     while (nz--) {
35554e2b4712SSatish Balay       idx   = 3*(*vi++);
3556f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3557f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3558f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3559f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
35604e2b4712SSatish Balay       v += 9;
35614e2b4712SSatish Balay     }
35624e2b4712SSatish Balay     idc = 3*(*c--);
35634e2b4712SSatish Balay     v   = aa + 9*diag[i];
3564f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3565f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3566f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
35674e2b4712SSatish Balay   }
35684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
35694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3570d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3572dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
35734e2b4712SSatish Balay   PetscFunctionReturn(0);
35744e2b4712SSatish Balay }
35754e2b4712SSatish Balay 
35760c4413a7SShri Abhyankar #undef __FUNCT__
3577a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3578a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
35790c4413a7SShri Abhyankar {
35800c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
35810c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
35820c4413a7SShri Abhyankar   PetscErrorCode    ierr;
35830c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
35840c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
35850c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
35860c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
35870c4413a7SShri Abhyankar   const PetscScalar *b;
35880c4413a7SShri Abhyankar 
35890c4413a7SShri Abhyankar   PetscFunctionBegin;
35900c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35910c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
35920c4413a7SShri Abhyankar   t  = a->solve_work;
35930c4413a7SShri Abhyankar 
35940c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
35950c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
35960c4413a7SShri Abhyankar 
35970c4413a7SShri Abhyankar   /* forward solve the lower triangular */
35980c4413a7SShri Abhyankar   idx    = 3*r[0];
35990c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
36000c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
36010c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
36020c4413a7SShri Abhyankar     vi    = aj + ai[i];
36030c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
36040c4413a7SShri Abhyankar     idx   = 3*r[i];
36050c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
36060c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
36070c4413a7SShri Abhyankar       idx   = 3*vi[m];
36080c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
36090c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
36100c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
36110c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
36120c4413a7SShri Abhyankar       v += 9;
36130c4413a7SShri Abhyankar     }
36140c4413a7SShri Abhyankar     idx = 3*i;
36150c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
36160c4413a7SShri Abhyankar   }
36170c4413a7SShri Abhyankar   /* backward solve the upper triangular */
36180c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
36190c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
36200c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
36210c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
36220c4413a7SShri Abhyankar     idt  = 3*i;
36230c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
36240c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
36250c4413a7SShri Abhyankar       idx   = 3*vi[m];
36260c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
36270c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
36280c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
36290c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
36300c4413a7SShri Abhyankar       v += 9;
36310c4413a7SShri Abhyankar     }
36320c4413a7SShri Abhyankar     idc = 3*c[i];
36330c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
36340c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
36350c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
36360c4413a7SShri Abhyankar   }
36370c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
36380c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
36390c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
36400c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
36410c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
36420c4413a7SShri Abhyankar   PetscFunctionReturn(0);
36430c4413a7SShri Abhyankar }
36440c4413a7SShri Abhyankar 
364515091d37SBarry Smith /*
364615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
364715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
364815091d37SBarry Smith */
36494a2ae208SSatish Balay #undef __FUNCT__
36504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3651dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
365215091d37SBarry Smith {
365315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3654690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3655dfbe8321SBarry Smith   PetscErrorCode    ierr;
3656690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3657d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3658d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3659d9fead3dSBarry Smith   const PetscScalar *b;
3660690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
366115091d37SBarry Smith 
366215091d37SBarry Smith   PetscFunctionBegin;
3663d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
36641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
366515091d37SBarry Smith 
366615091d37SBarry Smith   /* forward solve the lower triangular */
366715091d37SBarry Smith   idx    = 0;
366815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
366915091d37SBarry Smith   for (i=1; i<n; i++) {
367015091d37SBarry Smith     v     =  aa      + 9*ai[i];
367115091d37SBarry Smith     vi    =  aj      + ai[i];
367215091d37SBarry Smith     nz    =  diag[i] - ai[i];
367315091d37SBarry Smith     idx   +=  3;
3674f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
367515091d37SBarry Smith     while (nz--) {
367615091d37SBarry Smith       jdx   = 3*(*vi++);
367715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3678f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3679f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3680f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
368115091d37SBarry Smith       v    += 9;
368215091d37SBarry Smith     }
3683f1af5d2fSBarry Smith     x[idx]   = s1;
3684f1af5d2fSBarry Smith     x[1+idx] = s2;
3685f1af5d2fSBarry Smith     x[2+idx] = s3;
368615091d37SBarry Smith   }
368715091d37SBarry Smith   /* backward solve the upper triangular */
368815091d37SBarry Smith   for (i=n-1; i>=0; i--){
368915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
369015091d37SBarry Smith     vi   = aj + diag[i] + 1;
369115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
369215091d37SBarry Smith     idt  = 3*i;
3693f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3694f1af5d2fSBarry Smith     s3 = x[2+idt];
369515091d37SBarry Smith     while (nz--) {
369615091d37SBarry Smith       idx   = 3*(*vi++);
369715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3698f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3699f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3700f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
370115091d37SBarry Smith       v    += 9;
370215091d37SBarry Smith     }
370315091d37SBarry Smith     v        = aa +  9*diag[i];
3704f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3705f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3706f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
370715091d37SBarry Smith   }
370815091d37SBarry Smith 
3709d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
37101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3711dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
371215091d37SBarry Smith   PetscFunctionReturn(0);
371315091d37SBarry Smith }
371415091d37SBarry Smith 
3715cee9d6f2SShri Abhyankar #undef __FUNCT__
3716a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3717a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3718b2b2dd24SShri Abhyankar {
3719b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3721b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3722b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3723b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3724b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3725b2b2dd24SShri Abhyankar     PetscScalar       *x;
3726b2b2dd24SShri Abhyankar     const PetscScalar *b;
3727b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3728b2b2dd24SShri Abhyankar 
3729b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3730b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3731b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3732b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3733b2b2dd24SShri Abhyankar     idx    = 0;
3734b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3735b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3736b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3737b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3738b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3739b2b2dd24SShri Abhyankar       idx   = bs*i;
3740b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3741b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
3742b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
3743b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3744b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3745b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3746b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3747b2b2dd24SShri Abhyankar 
3748b2b2dd24SShri Abhyankar           v   +=  bs2;
3749b2b2dd24SShri Abhyankar         }
3750b2b2dd24SShri Abhyankar 
3751b2b2dd24SShri Abhyankar        x[idx]   = s1;
3752b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3753b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3754b2b2dd24SShri Abhyankar     }
3755b2b2dd24SShri Abhyankar 
3756b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3757b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3758b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3759b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3760b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3761b2b2dd24SShri Abhyankar      idt = bs*i;
3762b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3763b2b2dd24SShri Abhyankar 
3764b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
3765b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
3766b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3767b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3768b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3769b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3770b2b2dd24SShri Abhyankar 
3771b2b2dd24SShri Abhyankar         v   +=  bs2;
3772b2b2dd24SShri Abhyankar     }
3773b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3774b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3775b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3776b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3777b2b2dd24SShri Abhyankar 
3778b2b2dd24SShri Abhyankar   }
3779b2b2dd24SShri Abhyankar 
3780b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3781b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3782b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3783b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3784b2b2dd24SShri Abhyankar }
3785b2b2dd24SShri Abhyankar 
3786b2b2dd24SShri Abhyankar #undef __FUNCT__
37874a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3788dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
37894e2b4712SSatish Balay {
37904e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
37914e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
37926849ba73SBarry Smith   PetscErrorCode    ierr;
37935d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
37945d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3795d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3796d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
3797d9fead3dSBarry Smith   const PetscScalar *b;
37984e2b4712SSatish Balay 
37994e2b4712SSatish Balay   PetscFunctionBegin;
3800d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38011ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3802f1af5d2fSBarry Smith   t  = a->solve_work;
38034e2b4712SSatish Balay 
38044e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
38054e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
38064e2b4712SSatish Balay 
38074e2b4712SSatish Balay   /* forward solve the lower triangular */
38084e2b4712SSatish Balay   idx    = 2*(*r++);
3809f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
38104e2b4712SSatish Balay   for (i=1; i<n; i++) {
38114e2b4712SSatish Balay     v     = aa + 4*ai[i];
38124e2b4712SSatish Balay     vi    = aj + ai[i];
38134e2b4712SSatish Balay     nz    = diag[i] - ai[i];
38144e2b4712SSatish Balay     idx   = 2*(*r++);
3815f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
38164e2b4712SSatish Balay     while (nz--) {
38174e2b4712SSatish Balay       idx   = 2*(*vi++);
3818f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3819f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3820f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
38214e2b4712SSatish Balay       v += 4;
38224e2b4712SSatish Balay     }
38234e2b4712SSatish Balay     idx = 2*i;
3824f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
38254e2b4712SSatish Balay   }
38264e2b4712SSatish Balay   /* backward solve the upper triangular */
38274e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
38284e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
38294e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
38304e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
38314e2b4712SSatish Balay     idt  = 2*i;
3832f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
38334e2b4712SSatish Balay     while (nz--) {
38344e2b4712SSatish Balay       idx   = 2*(*vi++);
3835f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3836f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3837f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
38384e2b4712SSatish Balay       v += 4;
38394e2b4712SSatish Balay     }
38404e2b4712SSatish Balay     idc = 2*(*c--);
38414e2b4712SSatish Balay     v   = aa + 4*diag[i];
3842f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3843f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
38444e2b4712SSatish Balay   }
38454e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
38464e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3847d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3849dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
38504e2b4712SSatish Balay   PetscFunctionReturn(0);
38514e2b4712SSatish Balay }
38524e2b4712SSatish Balay 
38530c4413a7SShri Abhyankar #undef __FUNCT__
3854a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
3855a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
38560c4413a7SShri Abhyankar {
38570c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
38580c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
38590c4413a7SShri Abhyankar   PetscErrorCode    ierr;
38600c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
38610c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
38620c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
38630c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
38640c4413a7SShri Abhyankar   const PetscScalar *b;
38650c4413a7SShri Abhyankar 
38660c4413a7SShri Abhyankar   PetscFunctionBegin;
38670c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38680c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
38690c4413a7SShri Abhyankar   t  = a->solve_work;
38700c4413a7SShri Abhyankar 
38710c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
38720c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
38730c4413a7SShri Abhyankar 
38740c4413a7SShri Abhyankar   /* forward solve the lower triangular */
38750c4413a7SShri Abhyankar   idx    = 2*r[0];
38760c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
38770c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
38780c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
38790c4413a7SShri Abhyankar     vi    = aj + ai[i];
38800c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
38810c4413a7SShri Abhyankar     idx   = 2*r[i];
38820c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
38830c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
38840c4413a7SShri Abhyankar       jdx   = 2*vi[m];
38850c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
38860c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
38870c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
38880c4413a7SShri Abhyankar       v += 4;
38890c4413a7SShri Abhyankar     }
38900c4413a7SShri Abhyankar     idx = 2*i;
38910c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
38920c4413a7SShri Abhyankar   }
38930c4413a7SShri Abhyankar   /* backward solve the upper triangular */
38940c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
38950c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
38960c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
38970c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
38980c4413a7SShri Abhyankar     idt  = 2*i;
38990c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
39000c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
39010c4413a7SShri Abhyankar       idx   = 2*vi[m];
39020c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
39030c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
39040c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
39050c4413a7SShri Abhyankar       v += 4;
39060c4413a7SShri Abhyankar     }
39070c4413a7SShri Abhyankar     idc = 2*c[i];
39080c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
39090c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
39100c4413a7SShri Abhyankar   }
39110c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
39120c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
39130c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39140c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
39150c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
39160c4413a7SShri Abhyankar   PetscFunctionReturn(0);
39170c4413a7SShri Abhyankar }
39188f690400SShri Abhyankar 
391915091d37SBarry Smith /*
392015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
392115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
392215091d37SBarry Smith */
39234a2ae208SSatish Balay #undef __FUNCT__
39244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3925dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
392615091d37SBarry Smith {
392715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3928690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3929dfbe8321SBarry Smith   PetscErrorCode    ierr;
3930690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3931d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3932d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
3933d9fead3dSBarry Smith   const PetscScalar *b;
3934690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
393515091d37SBarry Smith 
393615091d37SBarry Smith   PetscFunctionBegin;
3937d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39381ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
393915091d37SBarry Smith 
394015091d37SBarry Smith   /* forward solve the lower triangular */
394115091d37SBarry Smith   idx    = 0;
394215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
394315091d37SBarry Smith   for (i=1; i<n; i++) {
394415091d37SBarry Smith     v     =  aa      + 4*ai[i];
394515091d37SBarry Smith     vi    =  aj      + ai[i];
394615091d37SBarry Smith     nz    =  diag[i] - ai[i];
394715091d37SBarry Smith     idx   +=  2;
3948f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
394915091d37SBarry Smith     while (nz--) {
395015091d37SBarry Smith       jdx   = 2*(*vi++);
395115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3952f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3953f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
395415091d37SBarry Smith       v    += 4;
395515091d37SBarry Smith     }
3956f1af5d2fSBarry Smith     x[idx]   = s1;
3957f1af5d2fSBarry Smith     x[1+idx] = s2;
395815091d37SBarry Smith   }
395915091d37SBarry Smith   /* backward solve the upper triangular */
396015091d37SBarry Smith   for (i=n-1; i>=0; i--){
396115091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
396215091d37SBarry Smith     vi   = aj + diag[i] + 1;
396315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
396415091d37SBarry Smith     idt  = 2*i;
3965f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
396615091d37SBarry Smith     while (nz--) {
396715091d37SBarry Smith       idx   = 2*(*vi++);
396815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3969f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3970f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
397115091d37SBarry Smith       v    += 4;
397215091d37SBarry Smith     }
397315091d37SBarry Smith     v        = aa +  4*diag[i];
3974f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3975f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
397615091d37SBarry Smith   }
397715091d37SBarry Smith 
3978d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3980dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
398115091d37SBarry Smith   PetscFunctionReturn(0);
398215091d37SBarry Smith }
398315091d37SBarry Smith 
3984cee9d6f2SShri Abhyankar #undef __FUNCT__
3985a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3986a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3987b2b2dd24SShri Abhyankar {
3988b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3989b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
3990b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3991b2b2dd24SShri Abhyankar     PetscInt          jdx;
3992b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3993b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
3994b2b2dd24SShri Abhyankar     const PetscScalar *b;
3995b2b2dd24SShri Abhyankar 
3996b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3997b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3998b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3999b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4000b2b2dd24SShri Abhyankar     idx    = 0;
4001b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4002b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4003b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4004b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4005b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4006b2b2dd24SShri Abhyankar        idx  = 2*i;
4007b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4008b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4009b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4010b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4011b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4012b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4013b2b2dd24SShri Abhyankar            v   +=  4;
4014b2b2dd24SShri Abhyankar         }
4015b2b2dd24SShri Abhyankar        x[idx]   = s1;
4016b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4017b2b2dd24SShri Abhyankar     }
4018b2b2dd24SShri Abhyankar 
4019b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4020b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4021b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4022b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4023b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4024b2b2dd24SShri Abhyankar      idt = 2*i;
4025b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4026b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4027b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4028b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4029b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4030b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4031b2b2dd24SShri Abhyankar          v    += 4;
4032b2b2dd24SShri Abhyankar     }
4033b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4034b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4035b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4036b2b2dd24SShri Abhyankar   }
4037b2b2dd24SShri Abhyankar 
4038b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4039b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4040b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4041b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4042b2b2dd24SShri Abhyankar }
4043b2b2dd24SShri Abhyankar 
4044b2b2dd24SShri Abhyankar #undef __FUNCT__
40454a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4046dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
40474e2b4712SSatish Balay {
40484e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
40494e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
40506849ba73SBarry Smith   PetscErrorCode ierr;
40515d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
40525d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
40533f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
405487828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
40554e2b4712SSatish Balay 
40564e2b4712SSatish Balay   PetscFunctionBegin;
40574e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
40584e2b4712SSatish Balay 
40591ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
40601ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4061f1af5d2fSBarry Smith   t  = a->solve_work;
40624e2b4712SSatish Balay 
40634e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
40644e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
40654e2b4712SSatish Balay 
40664e2b4712SSatish Balay   /* forward solve the lower triangular */
4067f1af5d2fSBarry Smith   t[0] = b[*r++];
40684e2b4712SSatish Balay   for (i=1; i<n; i++) {
40694e2b4712SSatish Balay     v     = aa + ai[i];
40704e2b4712SSatish Balay     vi    = aj + ai[i];
40714e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4072f1af5d2fSBarry Smith     s1  = b[*r++];
40734e2b4712SSatish Balay     while (nz--) {
4074f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
40754e2b4712SSatish Balay     }
4076f1af5d2fSBarry Smith     t[i] = s1;
40774e2b4712SSatish Balay   }
40784e2b4712SSatish Balay   /* backward solve the upper triangular */
40794e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
40804e2b4712SSatish Balay     v    = aa + diag[i] + 1;
40814e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
40824e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4083f1af5d2fSBarry Smith     s1 = t[i];
40844e2b4712SSatish Balay     while (nz--) {
4085f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
40864e2b4712SSatish Balay     }
4087f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
40884e2b4712SSatish Balay   }
40894e2b4712SSatish Balay 
40904e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
40914e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40921ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4094dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
40954e2b4712SSatish Balay   PetscFunctionReturn(0);
40964e2b4712SSatish Balay }
409715091d37SBarry Smith /*
409815091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
409915091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
410015091d37SBarry Smith */
41014a2ae208SSatish Balay #undef __FUNCT__
41024a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4103dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
410415091d37SBarry Smith {
410515091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4106690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4107dfbe8321SBarry Smith   PetscErrorCode ierr;
4108690b6cddSBarry Smith   PetscInt       *diag = a->diag;
410915091d37SBarry Smith   MatScalar      *aa=a->a;
411087828ca2SBarry Smith   PetscScalar    *x,*b;
411187828ca2SBarry Smith   PetscScalar    s1,x1;
411215091d37SBarry Smith   MatScalar      *v;
4113690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
411415091d37SBarry Smith 
411515091d37SBarry Smith   PetscFunctionBegin;
41161ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
41171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
411815091d37SBarry Smith 
411915091d37SBarry Smith   /* forward solve the lower triangular */
412015091d37SBarry Smith   idx    = 0;
412115091d37SBarry Smith   x[0]   = b[0];
412215091d37SBarry Smith   for (i=1; i<n; i++) {
412315091d37SBarry Smith     v     =  aa      + ai[i];
412415091d37SBarry Smith     vi    =  aj      + ai[i];
412515091d37SBarry Smith     nz    =  diag[i] - ai[i];
412615091d37SBarry Smith     idx   +=  1;
4127f1af5d2fSBarry Smith     s1  =  b[idx];
412815091d37SBarry Smith     while (nz--) {
412915091d37SBarry Smith       jdx   = *vi++;
413015091d37SBarry Smith       x1    = x[jdx];
4131f1af5d2fSBarry Smith       s1 -= v[0]*x1;
413215091d37SBarry Smith       v    += 1;
413315091d37SBarry Smith     }
4134f1af5d2fSBarry Smith     x[idx]   = s1;
413515091d37SBarry Smith   }
413615091d37SBarry Smith   /* backward solve the upper triangular */
413715091d37SBarry Smith   for (i=n-1; i>=0; i--){
413815091d37SBarry Smith     v    = aa + diag[i] + 1;
413915091d37SBarry Smith     vi   = aj + diag[i] + 1;
414015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
414115091d37SBarry Smith     idt  = i;
4142f1af5d2fSBarry Smith     s1 = x[idt];
414315091d37SBarry Smith     while (nz--) {
414415091d37SBarry Smith       idx   = *vi++;
414515091d37SBarry Smith       x1    = x[idx];
4146f1af5d2fSBarry Smith       s1 -= v[0]*x1;
414715091d37SBarry Smith       v    += 1;
414815091d37SBarry Smith     }
414915091d37SBarry Smith     v        = aa +  diag[i];
4150f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
415115091d37SBarry Smith   }
41521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
41531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4154dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
415515091d37SBarry Smith   PetscFunctionReturn(0);
415615091d37SBarry Smith }
41574e2b4712SSatish Balay 
41584e2b4712SSatish Balay /* ----------------------------------------------------------------*/
415916a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
41606bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4161ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
41626bce7ff8SHong Zhang 
41636bce7ff8SHong Zhang #undef __FUNCT__
41646bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
41656bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
41666bce7ff8SHong Zhang {
41676bce7ff8SHong Zhang   Mat            C=B;
41686bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
41696bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
41706bce7ff8SHong Zhang   PetscErrorCode ierr;
41716bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
41726bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
41736bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4174b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4175914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4176914a18a2SHong Zhang   MatScalar      *v_work;
4177ae3d28f0SHong Zhang   PetscTruth     col_identity,row_identity,both_identity;
41786bce7ff8SHong Zhang 
41796bce7ff8SHong Zhang   PetscFunctionBegin;
41806bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
41816bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4182ae3d28f0SHong Zhang 
4183fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4184fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
41856bce7ff8SHong Zhang   ics  = ic;
41866bce7ff8SHong Zhang 
4187914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
4188fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
4189914a18a2SHong Zhang 
41906bce7ff8SHong Zhang   for (i=0; i<n; i++){
41916bce7ff8SHong Zhang     /* zero rtmp */
41926bce7ff8SHong Zhang     /* L part */
41936bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
41946bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4195914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4196914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4197914a18a2SHong Zhang     }
41986bce7ff8SHong Zhang 
41996bce7ff8SHong Zhang     /* U part */
42001a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
42011a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
42021a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
42031a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
42041a83e813SShri Abhyankar     }
42051a83e813SShri Abhyankar 
42061a83e813SShri Abhyankar     /* load in initial (unfactored row) */
42071a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
42081a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
42091a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
42101a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
42111a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
42121a83e813SShri Abhyankar     }
42131a83e813SShri Abhyankar 
42141a83e813SShri Abhyankar     /* elimination */
42151a83e813SShri Abhyankar     bjtmp = bj + bi[i];
42161a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
42171a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
42181a83e813SShri Abhyankar       row = bjtmp[k];
42191a83e813SShri Abhyankar       pc = rtmp + bs2*row;
42201a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
42211a83e813SShri Abhyankar       if (flg) {
42221a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
42231a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
42241a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
42251a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
42261a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
42271a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
42281a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
42291a83e813SShri Abhyankar         }
42301a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
42311a83e813SShri Abhyankar       }
42321a83e813SShri Abhyankar     }
42331a83e813SShri Abhyankar 
42341a83e813SShri Abhyankar     /* finished row so stick it into b->a */
42351a83e813SShri Abhyankar     /* L part */
42361a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
42371a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
42381a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
42391a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
42401a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
42411a83e813SShri Abhyankar     }
42421a83e813SShri Abhyankar 
42431a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
42441a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
42451a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
42461a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
42471a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
42481a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
42491a83e813SShri Abhyankar 
42501a83e813SShri Abhyankar     /* U part */
42511a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
42521a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
42531a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
42541a83e813SShri Abhyankar     for (j=0; j<nz; j++){
42551a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
42561a83e813SShri Abhyankar     }
42571a83e813SShri Abhyankar   }
42581a83e813SShri Abhyankar 
42591a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4260fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
42611a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
42621a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
42631a83e813SShri Abhyankar 
4264ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4265ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
4266ae3d28f0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
4267ae3d28f0SHong Zhang   if (both_identity){
4268a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4269ae3d28f0SHong Zhang   } else {
4270a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
4271ae3d28f0SHong Zhang   }
4272ae3d28f0SHong Zhang 
42731a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
42741a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
42751a83e813SShri Abhyankar   PetscFunctionReturn(0);
42761a83e813SShri Abhyankar }
42771a83e813SShri Abhyankar 
42786bce7ff8SHong Zhang /*
42796bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
428016a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
428116a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
42826bce7ff8SHong Zhang */
4283c0c7eb62SShri Abhyankar 
42846bce7ff8SHong Zhang #undef __FUNCT__
42856bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
42866bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
42876bce7ff8SHong Zhang {
42886bce7ff8SHong Zhang 
42896bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
42906bce7ff8SHong Zhang   PetscErrorCode     ierr;
429116a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
429235aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
429335aa4fcfSShri Abhyankar 
429435aa4fcfSShri Abhyankar   PetscFunctionBegin;
429535aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
429635aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
429735aa4fcfSShri Abhyankar 
429835aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
429935aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
430035aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
430135aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
430235aa4fcfSShri Abhyankar   if (!b->diag){
430335aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
430435aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
430535aa4fcfSShri Abhyankar   }
430635aa4fcfSShri Abhyankar   bdiag = b->diag;
430735aa4fcfSShri Abhyankar 
430835aa4fcfSShri Abhyankar   if (n > 0) {
430935aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
431035aa4fcfSShri Abhyankar   }
431135aa4fcfSShri Abhyankar 
431235aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
431335aa4fcfSShri Abhyankar   bi = b->i;
431435aa4fcfSShri Abhyankar   bj = b->j;
431535aa4fcfSShri Abhyankar 
431635aa4fcfSShri Abhyankar   /* L part */
431735aa4fcfSShri Abhyankar   bi[0] = 0;
431835aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
431935aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
432035aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
432135aa4fcfSShri Abhyankar     aj = a->j + ai[i];
432235aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
432335aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
432435aa4fcfSShri Abhyankar     }
432535aa4fcfSShri Abhyankar   }
432635aa4fcfSShri Abhyankar 
432735aa4fcfSShri Abhyankar   /* U part */
432835aa4fcfSShri Abhyankar   bi_temp = bi[n];
432935aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
433035aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
433135aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
433235aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
433335aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
433435aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
433535aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
433635aa4fcfSShri Abhyankar     }
433735aa4fcfSShri Abhyankar     /* diag[i] */
433835aa4fcfSShri Abhyankar     *bj = i; bj++;
433935aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
434035aa4fcfSShri Abhyankar   }
434135aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
434235aa4fcfSShri Abhyankar }
434335aa4fcfSShri Abhyankar 
434435aa4fcfSShri Abhyankar #undef __FUNCT__
434516a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
434616a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
434716a2bf60SHong Zhang {
434816a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
434916a2bf60SHong Zhang   IS                 isicol;
435016a2bf60SHong Zhang   PetscErrorCode     ierr;
435116a2bf60SHong Zhang   const PetscInt     *r,*ic;
43527fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
435316a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
435416a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
435516a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
43567fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
435716a2bf60SHong Zhang   PetscReal          f;
435816a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
435916a2bf60SHong Zhang   PetscBT            lnkbt;
436016a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
436116a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
436216a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
436316a2bf60SHong Zhang   PetscTruth         missing;
43647fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
436516a2bf60SHong Zhang 
436616a2bf60SHong Zhang   PetscFunctionBegin;
436716a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
436816a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
436916a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
437016a2bf60SHong Zhang 
437116a2bf60SHong Zhang   f             = info->fill;
437216a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
437316a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
437416a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
437516a2bf60SHong Zhang 
437616a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
437716a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
43787fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
437916a2bf60SHong Zhang 
43807fa3a6a0SHong Zhang   if (!levels && both_identity) {
438116a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
438216a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4383ae3d28f0SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
438435aa4fcfSShri Abhyankar 
438535aa4fcfSShri Abhyankar     fact->factor = MAT_FACTOR_ILU;
438635aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
438735aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
438835aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
438935aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
439035aa4fcfSShri Abhyankar     b->row           = isrow;
439135aa4fcfSShri Abhyankar     b->col           = iscol;
439235aa4fcfSShri Abhyankar     b->icol          = isicol;
439335aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
439435aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
439535aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
439635aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
439735aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
439835aa4fcfSShri Abhyankar   }
439935aa4fcfSShri Abhyankar 
440035aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
440135aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
440235aa4fcfSShri Abhyankar 
440335aa4fcfSShri Abhyankar   /* get new row pointers */
440435aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
440535aa4fcfSShri Abhyankar   bi[0] = 0;
440635aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
440735aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
440835aa4fcfSShri Abhyankar   bdiag[0]  = 0;
440935aa4fcfSShri Abhyankar 
4410fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
441135aa4fcfSShri Abhyankar 
441235aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
441335aa4fcfSShri Abhyankar   nlnk = n + 1;
441435aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
441535aa4fcfSShri Abhyankar 
441635aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
441735aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
441835aa4fcfSShri Abhyankar   current_space = free_space;
441935aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
442035aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
442135aa4fcfSShri Abhyankar 
442235aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
442335aa4fcfSShri Abhyankar     nzi = 0;
442435aa4fcfSShri Abhyankar     /* copy current row into linked list */
442535aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
442635aa4fcfSShri Abhyankar     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
442735aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
442835aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
442935aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
443035aa4fcfSShri Abhyankar     nzi += nlnk;
443135aa4fcfSShri Abhyankar 
443235aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
443335aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
443435aa4fcfSShri Abhyankar       fm = n;
443535aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
443635aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
443735aa4fcfSShri Abhyankar       lnk[fm]    = i;
443835aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
443935aa4fcfSShri Abhyankar       nzi++; dcount++;
444035aa4fcfSShri Abhyankar     }
444135aa4fcfSShri Abhyankar 
444235aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
444335aa4fcfSShri Abhyankar     nzbd = 0;
444435aa4fcfSShri Abhyankar     prow = lnk[n];
444535aa4fcfSShri Abhyankar     while (prow < i) {
444635aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
444735aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
444835aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
444935aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
445035aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
445135aa4fcfSShri Abhyankar       nzi += nlnk;
445235aa4fcfSShri Abhyankar       prow = lnk[prow];
445335aa4fcfSShri Abhyankar       nzbd++;
445435aa4fcfSShri Abhyankar     }
445535aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
445635aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
445735aa4fcfSShri Abhyankar 
445835aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
445935aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
446035aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
446135aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
446235aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
446335aa4fcfSShri Abhyankar       reallocs++;
446435aa4fcfSShri Abhyankar     }
446535aa4fcfSShri Abhyankar 
446635aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
446735aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
446835aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
446935aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
447035aa4fcfSShri Abhyankar 
447135aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
447235aa4fcfSShri Abhyankar     if (*(bj_ptr[i]+bdiag[i]) != i) {
447335aa4fcfSShri Abhyankar       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
447435aa4fcfSShri Abhyankar     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
447535aa4fcfSShri Abhyankar     }
447635aa4fcfSShri Abhyankar 
447735aa4fcfSShri Abhyankar     current_space->array           += nzi;
447835aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
447935aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
448035aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
448135aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
448235aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
448335aa4fcfSShri Abhyankar   }
448435aa4fcfSShri Abhyankar 
448535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
448635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
448735aa4fcfSShri Abhyankar 
448835aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
448935aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
449035aa4fcfSShri Abhyankar 
449135aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
449235aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
449335aa4fcfSShri Abhyankar 
449435aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
449535aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
4496fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
449735aa4fcfSShri Abhyankar 
449835aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
449935aa4fcfSShri Abhyankar   {
450035aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
450135aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
450235aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
450335aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
450435aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
450535aa4fcfSShri Abhyankar     if (diagonal_fill) {
450635aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
450735aa4fcfSShri Abhyankar     }
450835aa4fcfSShri Abhyankar   }
450935aa4fcfSShri Abhyankar #endif
451035aa4fcfSShri Abhyankar 
451135aa4fcfSShri Abhyankar   /* put together the new matrix */
451235aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
451335aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
451435aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
451535aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
451635aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
451735aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
451835aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
451935aa4fcfSShri Abhyankar   b->j          = bj;
452035aa4fcfSShri Abhyankar   b->i          = bi;
452135aa4fcfSShri Abhyankar   b->diag       = bdiag;
452235aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
452335aa4fcfSShri Abhyankar   b->ilen       = 0;
452435aa4fcfSShri Abhyankar   b->imax       = 0;
452535aa4fcfSShri Abhyankar   b->row        = isrow;
452635aa4fcfSShri Abhyankar   b->col        = iscol;
452735aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
452835aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
452935aa4fcfSShri Abhyankar   b->icol       = isicol;
453035aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
453135aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
453235aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
453335aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
453435aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
4535ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
4536ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
4537ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
4538ae3d28f0SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
453935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
454035aa4fcfSShri Abhyankar }
454135aa4fcfSShri Abhyankar 
454235aa4fcfSShri Abhyankar 
45434e2b4712SSatish Balay /*
45444e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
45454e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
45464e2b4712SSatish Balay    Not a good example of code reuse.
45474e2b4712SSatish Balay */
45484a2ae208SSatish Balay #undef __FUNCT__
45494a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
45500481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
45514e2b4712SSatish Balay {
45524e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
45534e2b4712SSatish Balay   IS             isicol;
45546849ba73SBarry Smith   PetscErrorCode ierr;
45555d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
45565d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4557a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4558d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
455941df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
4560329f5518SBarry Smith   PetscReal      f;
4561c0c7eb62SShri Abhyankar   PetscTruth     newdatastruct = PETSC_FALSE;
45624e2b4712SSatish Balay 
45634e2b4712SSatish Balay   PetscFunctionBegin;
456416a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
456516a2bf60SHong Zhang   if (newdatastruct){
456616a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
456716a2bf60SHong Zhang     PetscFunctionReturn(0);
456816a2bf60SHong Zhang   }
456916a2bf60SHong Zhang 
45706bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
45716bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
45726bce7ff8SHong Zhang 
4573435faa5fSBarry Smith   f             = info->fill;
4574690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
4575690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
45764c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
457716a2bf60SHong Zhang 
4578667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4579667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
45807d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
4581309c388cSBarry Smith 
458241df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
458316a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
45846bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
45856bce7ff8SHong Zhang 
4586719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
4587ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
4588bb3d539aSBarry Smith     b->row       = isrow;
4589bb3d539aSBarry Smith     b->col       = iscol;
4590bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4591bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4592bb3d539aSBarry Smith     b->icol      = isicol;
4593bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4594b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
45956bce7ff8SHong Zhang     PetscFunctionReturn(0);
45966bce7ff8SHong Zhang   }
45976bce7ff8SHong Zhang 
45986bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
45994e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
46004e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
46014e2b4712SSatish Balay 
46024e2b4712SSatish Balay     /* get new row pointers */
4603690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
46044e2b4712SSatish Balay     ainew[0] = 0;
46054e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
4606690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
4607690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
46084e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
4609690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
46104e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
4611690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
46124e2b4712SSatish Balay     /* im is level for each filled value */
4613690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
46144e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
4615690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
46164e2b4712SSatish Balay     dloc[0]  = 0;
46174e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
4618435faa5fSBarry Smith 
4619435faa5fSBarry Smith       /* copy prow into linked list */
46204e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
46213b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
46224e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
46234e2b4712SSatish Balay       fill[n]    = n;
4624435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
46254e2b4712SSatish Balay       while (nz--) {
46264e2b4712SSatish Balay 	fm  = n;
46274e2b4712SSatish Balay 	idx = ic[*xi++];
46284e2b4712SSatish Balay 	do {
46294e2b4712SSatish Balay 	  m  = fm;
46304e2b4712SSatish Balay 	  fm = fill[m];
46314e2b4712SSatish Balay 	} while (fm < idx);
46324e2b4712SSatish Balay 	fill[m]   = idx;
46334e2b4712SSatish Balay 	fill[idx] = fm;
46344e2b4712SSatish Balay 	im[idx]   = 0;
46354e2b4712SSatish Balay       }
4636435faa5fSBarry Smith 
4637435faa5fSBarry Smith       /* make sure diagonal entry is included */
4638435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
4639435faa5fSBarry Smith 	fm = n;
4640435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
4641435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
4642435faa5fSBarry Smith 	fill[fm]   = prow;
4643435faa5fSBarry Smith 	im[prow]   = 0;
4644435faa5fSBarry Smith 	nzf++;
4645335d9088SBarry Smith 	dcount++;
4646435faa5fSBarry Smith       }
4647435faa5fSBarry Smith 
46484e2b4712SSatish Balay       nzi = 0;
46494e2b4712SSatish Balay       row = fill[n];
46504e2b4712SSatish Balay       while (row < prow) {
46514e2b4712SSatish Balay 	incrlev = im[row] + 1;
46524e2b4712SSatish Balay 	nz      = dloc[row];
4653435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
46544e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
46554e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
46564e2b4712SSatish Balay 	fm      = row;
46574e2b4712SSatish Balay 	while (nnz-- > 0) {
46584e2b4712SSatish Balay 	  idx = *xi++;
46594e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
46604e2b4712SSatish Balay 	    flev++;
46614e2b4712SSatish Balay 	    continue;
46624e2b4712SSatish Balay 	  }
46634e2b4712SSatish Balay 	  do {
46644e2b4712SSatish Balay 	    m  = fm;
46654e2b4712SSatish Balay 	    fm = fill[m];
46664e2b4712SSatish Balay 	  } while (fm < idx);
46674e2b4712SSatish Balay 	  if (fm != idx) {
46684e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
46694e2b4712SSatish Balay 	    fill[m]   = idx;
46704e2b4712SSatish Balay 	    fill[idx] = fm;
46714e2b4712SSatish Balay 	    fm        = idx;
46724e2b4712SSatish Balay 	    nzf++;
4673ecf371e4SBarry Smith 	  } else {
46744e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
46754e2b4712SSatish Balay 	  }
46764e2b4712SSatish Balay 	  flev++;
46774e2b4712SSatish Balay 	}
46784e2b4712SSatish Balay 	row = fill[row];
46794e2b4712SSatish Balay 	nzi++;
46804e2b4712SSatish Balay       }
46814e2b4712SSatish Balay       /* copy new filled row into permanent storage */
46824e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
46834e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
4684ecf371e4SBarry Smith 
4685ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
4686ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
4687ecf371e4SBarry Smith 	/* just double the memory each time */
4688690b6cddSBarry Smith 	PetscInt maxadd = jmax;
4689ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
46904e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
46914e2b4712SSatish Balay 	jmax += maxadd;
4692ecf371e4SBarry Smith 
4693ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
46945d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
46955d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4696606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
46975d0c19d7SBarry Smith 	ajnew = xitmp;
46985d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
46995d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
4700606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
47015d0c19d7SBarry Smith 	ajfill = xitmp;
4702eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
47034e2b4712SSatish Balay       }
47045d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
47054e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
47064e2b4712SSatish Balay       dloc[prow]  = nzi;
47074e2b4712SSatish Balay       fm          = fill[n];
47084e2b4712SSatish Balay       while (nzf--) {
47095d0c19d7SBarry Smith 	*xitmp++ = fm;
47104e2b4712SSatish Balay 	*flev++ = im[fm];
47114e2b4712SSatish Balay 	fm      = fill[fm];
47124e2b4712SSatish Balay       }
4713435faa5fSBarry Smith       /* make sure row has diagonal entry */
4714435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
471577431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
47162401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
4717435faa5fSBarry Smith       }
47184e2b4712SSatish Balay     }
4719606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
47204e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
47214e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
4722606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
4723606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
47244e2b4712SSatish Balay 
47256cf91177SBarry Smith #if defined(PETSC_USE_INFO)
47264e2b4712SSatish Balay     {
4727329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
4728ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
4729ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
4730ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
4731ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
4732335d9088SBarry Smith       if (diagonal_fill) {
4733ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
4734335d9088SBarry Smith       }
47354e2b4712SSatish Balay     }
473663ba0a88SBarry Smith #endif
47374e2b4712SSatish Balay 
47384e2b4712SSatish Balay     /* put together the new matrix */
4739719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
4740719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
4741ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
4742e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
4743e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
47447c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
4745a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
47464e2b4712SSatish Balay     b->j          = ajnew;
47474e2b4712SSatish Balay     b->i          = ainew;
47484e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
47494e2b4712SSatish Balay     b->diag       = dloc;
47507f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
47514e2b4712SSatish Balay     b->ilen       = 0;
47524e2b4712SSatish Balay     b->imax       = 0;
47534e2b4712SSatish Balay     b->row        = isrow;
47544e2b4712SSatish Balay     b->col        = iscol;
4755bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
4756c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
4757c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
4758e51c0b9cSSatish Balay     b->icol       = isicol;
475987828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
47604e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
47614e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
4762719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
47634e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
47644e2b4712SSatish Balay 
4765ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
4766ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
4767ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
47686bce7ff8SHong Zhang 
476941df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
47708661488fSKris Buschelman   PetscFunctionReturn(0);
47718661488fSKris Buschelman }
47728661488fSKris Buschelman 
4773732ee342SKris Buschelman #undef __FUNCT__
47747e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
4775dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
47767e7071cdSKris Buschelman {
477712272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
477812272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
47795a9542e3SKris Buschelman   PetscFunctionBegin;
47807cf1b8d3SKris Buschelman   /* Undo Column scaling */
47817cf1b8d3SKris Buschelman /*    while (nz--) { */
47827cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
47837cf1b8d3SKris Buschelman /*    } */
4784c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
4785c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
47867cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
47877cf1b8d3SKris Buschelman }
47887cf1b8d3SKris Buschelman 
47897cf1b8d3SKris Buschelman #undef __FUNCT__
47907cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
4791dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
47927cf1b8d3SKris Buschelman {
47937cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4794b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
47952aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
47965a9542e3SKris Buschelman   PetscFunctionBegin;
47970b9da03eSKris Buschelman   /* Is this really necessary? */
479820235379SKris Buschelman   while (nz--) {
47990b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
48007e7071cdSKris Buschelman   }
4801c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
48027e7071cdSKris Buschelman   PetscFunctionReturn(0);
48037e7071cdSKris Buschelman }
48047e7071cdSKris Buschelman 
4805732ee342SKris Buschelman 
4806