xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision cee9d6f295ef30f06a70cc94141c40a93dc7ea43)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18690b6cddSBarry Smith   PetscInt       *diag = a->diag;
19f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2087828ca2SBarry Smith   PetscScalar    s1,*x,*b;
21f1af5d2fSBarry Smith 
22f1af5d2fSBarry Smith   PetscFunctionBegin;
23ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26f1af5d2fSBarry Smith 
27f1af5d2fSBarry Smith   /* forward solve the U^T */
28f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
29f1af5d2fSBarry Smith 
30f1af5d2fSBarry Smith     v     = aa + diag[i];
31f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
32ef66eb69SBarry Smith     s1    = (*v++)*x[i];
33f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
34f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
35f1af5d2fSBarry Smith     while (nz--) {
36f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
37f1af5d2fSBarry Smith     }
38f1af5d2fSBarry Smith     x[i]   = s1;
39f1af5d2fSBarry Smith   }
40f1af5d2fSBarry Smith   /* backward solve the L^T */
41f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
42f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
43f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
44f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
45f1af5d2fSBarry Smith     s1   = x[i];
46f1af5d2fSBarry Smith     while (nz--) {
47f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
48f1af5d2fSBarry Smith     }
49f1af5d2fSBarry Smith   }
501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53f1af5d2fSBarry Smith   PetscFunctionReturn(0);
54f1af5d2fSBarry Smith }
55f1af5d2fSBarry Smith 
564a2ae208SSatish Balay #undef __FUNCT__
574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59f1af5d2fSBarry Smith {
60f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61dfbe8321SBarry Smith   PetscErrorCode ierr;
62690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
64f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6587828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6687828ca2SBarry Smith   PetscScalar    *x,*b;
67f1af5d2fSBarry Smith 
68f1af5d2fSBarry Smith   PetscFunctionBegin;
69ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
701ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72f1af5d2fSBarry Smith 
73f1af5d2fSBarry Smith   /* forward solve the U^T */
74f1af5d2fSBarry Smith   idx = 0;
75f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
76f1af5d2fSBarry Smith 
77f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
78f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
79ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
80f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
81f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
82f1af5d2fSBarry Smith     v += 4;
83f1af5d2fSBarry Smith 
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       oidx = 2*(*vi++);
88f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90f1af5d2fSBarry Smith       v  += 4;
91f1af5d2fSBarry Smith     }
92f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
93f1af5d2fSBarry Smith     idx += 2;
94f1af5d2fSBarry Smith   }
95f1af5d2fSBarry Smith   /* backward solve the L^T */
96f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
97f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
98f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
99f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
100f1af5d2fSBarry Smith     idt  = 2*i;
101f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
102f1af5d2fSBarry Smith     while (nz--) {
103f1af5d2fSBarry Smith       idx   = 2*(*vi--);
104f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106f1af5d2fSBarry Smith       v -= 4;
107f1af5d2fSBarry Smith     }
108f1af5d2fSBarry Smith   }
1091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
112f1af5d2fSBarry Smith   PetscFunctionReturn(0);
113f1af5d2fSBarry Smith }
114f1af5d2fSBarry Smith 
1154a2ae208SSatish Balay #undef __FUNCT__
1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118f1af5d2fSBarry Smith {
119f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120dfbe8321SBarry Smith   PetscErrorCode ierr;
121690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
123f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12487828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12587828ca2SBarry Smith   PetscScalar    *x,*b;
126f1af5d2fSBarry Smith 
127f1af5d2fSBarry Smith   PetscFunctionBegin;
128ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith   /* forward solve the U^T */
133f1af5d2fSBarry Smith   idx = 0;
134f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
135f1af5d2fSBarry Smith 
136f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
137f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
138ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142f1af5d2fSBarry Smith     v += 9;
143f1af5d2fSBarry Smith 
144f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
145f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
146f1af5d2fSBarry Smith     while (nz--) {
147f1af5d2fSBarry Smith       oidx = 3*(*vi++);
148f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151f1af5d2fSBarry Smith       v  += 9;
152f1af5d2fSBarry Smith     }
153f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154f1af5d2fSBarry Smith     idx += 3;
155f1af5d2fSBarry Smith   }
156f1af5d2fSBarry Smith   /* backward solve the L^T */
157f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
158f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
159f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
160f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
161f1af5d2fSBarry Smith     idt  = 3*i;
162f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163f1af5d2fSBarry Smith     while (nz--) {
164f1af5d2fSBarry Smith       idx   = 3*(*vi--);
165f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168f1af5d2fSBarry Smith       v -= 9;
169f1af5d2fSBarry Smith     }
170f1af5d2fSBarry Smith   }
1711ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
174f1af5d2fSBarry Smith   PetscFunctionReturn(0);
175f1af5d2fSBarry Smith }
176f1af5d2fSBarry Smith 
1774a2ae208SSatish Balay #undef __FUNCT__
1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180f1af5d2fSBarry Smith {
181f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182dfbe8321SBarry Smith   PetscErrorCode ierr;
183690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
185f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18787828ca2SBarry Smith   PetscScalar    *x,*b;
188f1af5d2fSBarry Smith 
189f1af5d2fSBarry Smith   PetscFunctionBegin;
190ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith   /* forward solve the U^T */
195f1af5d2fSBarry Smith   idx = 0;
196f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
197f1af5d2fSBarry Smith 
198f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
199f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
200ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205f1af5d2fSBarry Smith     v += 16;
206f1af5d2fSBarry Smith 
207f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
208f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
209f1af5d2fSBarry Smith     while (nz--) {
210f1af5d2fSBarry Smith       oidx = 4*(*vi++);
211f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215f1af5d2fSBarry Smith       v  += 16;
216f1af5d2fSBarry Smith     }
217f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218f1af5d2fSBarry Smith     idx += 4;
219f1af5d2fSBarry Smith   }
220f1af5d2fSBarry Smith   /* backward solve the L^T */
221f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
222f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
223f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
224f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
225f1af5d2fSBarry Smith     idt  = 4*i;
226f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227f1af5d2fSBarry Smith     while (nz--) {
228f1af5d2fSBarry Smith       idx   = 4*(*vi--);
229f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233f1af5d2fSBarry Smith       v -= 16;
234f1af5d2fSBarry Smith     }
235f1af5d2fSBarry Smith   }
2361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
239f1af5d2fSBarry Smith   PetscFunctionReturn(0);
240f1af5d2fSBarry Smith }
241f1af5d2fSBarry Smith 
2424a2ae208SSatish Balay #undef __FUNCT__
2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245f1af5d2fSBarry Smith {
246f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247dfbe8321SBarry Smith   PetscErrorCode ierr;
248690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
250f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25287828ca2SBarry Smith   PetscScalar    *x,*b;
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   PetscFunctionBegin;
255ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258f1af5d2fSBarry Smith 
259f1af5d2fSBarry Smith   /* forward solve the U^T */
260f1af5d2fSBarry Smith   idx = 0;
261f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
262f1af5d2fSBarry Smith 
263f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
264f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
265ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271f1af5d2fSBarry Smith     v += 25;
272f1af5d2fSBarry Smith 
273f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
274f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
275f1af5d2fSBarry Smith     while (nz--) {
276f1af5d2fSBarry Smith       oidx = 5*(*vi++);
277f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282f1af5d2fSBarry Smith       v  += 25;
283f1af5d2fSBarry Smith     }
284f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285f1af5d2fSBarry Smith     idx += 5;
286f1af5d2fSBarry Smith   }
287f1af5d2fSBarry Smith   /* backward solve the L^T */
288f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
289f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
290f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
291f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
292f1af5d2fSBarry Smith     idt  = 5*i;
293f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294f1af5d2fSBarry Smith     while (nz--) {
295f1af5d2fSBarry Smith       idx   = 5*(*vi--);
296f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301f1af5d2fSBarry Smith       v -= 25;
302f1af5d2fSBarry Smith     }
303f1af5d2fSBarry Smith   }
3041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
307f1af5d2fSBarry Smith   PetscFunctionReturn(0);
308f1af5d2fSBarry Smith }
309f1af5d2fSBarry Smith 
3104a2ae208SSatish Balay #undef __FUNCT__
3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313f1af5d2fSBarry Smith {
314f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315dfbe8321SBarry Smith   PetscErrorCode ierr;
316690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
318f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32087828ca2SBarry Smith   PetscScalar    *x,*b;
321f1af5d2fSBarry Smith 
322f1af5d2fSBarry Smith   PetscFunctionBegin;
323ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326f1af5d2fSBarry Smith 
327f1af5d2fSBarry Smith   /* forward solve the U^T */
328f1af5d2fSBarry Smith   idx = 0;
329f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
332f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
333ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334ef66eb69SBarry Smith     x6    = x[5+idx];
335f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341f1af5d2fSBarry Smith     v += 36;
342f1af5d2fSBarry Smith 
343f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
344f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
345f1af5d2fSBarry Smith     while (nz--) {
346f1af5d2fSBarry Smith       oidx = 6*(*vi++);
347f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353f1af5d2fSBarry Smith       v  += 36;
354f1af5d2fSBarry Smith     }
355f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356f1af5d2fSBarry Smith     x[5+idx] = s6;
357f1af5d2fSBarry Smith     idx += 6;
358f1af5d2fSBarry Smith   }
359f1af5d2fSBarry Smith   /* backward solve the L^T */
360f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
361f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
362f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
363f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
364f1af5d2fSBarry Smith     idt  = 6*i;
365f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366f1af5d2fSBarry Smith     s6 = x[5+idt];
367f1af5d2fSBarry Smith     while (nz--) {
368f1af5d2fSBarry Smith       idx   = 6*(*vi--);
369f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375f1af5d2fSBarry Smith       v -= 36;
376f1af5d2fSBarry Smith     }
377f1af5d2fSBarry Smith   }
3781ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
381f1af5d2fSBarry Smith   PetscFunctionReturn(0);
382f1af5d2fSBarry Smith }
383f1af5d2fSBarry Smith 
3844a2ae208SSatish Balay #undef __FUNCT__
3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387f1af5d2fSBarry Smith {
388f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389dfbe8321SBarry Smith   PetscErrorCode ierr;
390690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
392f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39487828ca2SBarry Smith   PetscScalar    *x,*b;
395f1af5d2fSBarry Smith 
396f1af5d2fSBarry Smith   PetscFunctionBegin;
397ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400f1af5d2fSBarry Smith 
401f1af5d2fSBarry Smith   /* forward solve the U^T */
402f1af5d2fSBarry Smith   idx = 0;
403f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
404f1af5d2fSBarry Smith 
405f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
406f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
407ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
409f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416f1af5d2fSBarry Smith     v += 49;
417f1af5d2fSBarry Smith 
418f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
419f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
420f1af5d2fSBarry Smith     while (nz--) {
421f1af5d2fSBarry Smith       oidx = 7*(*vi++);
422f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429f1af5d2fSBarry Smith       v  += 49;
430f1af5d2fSBarry Smith     }
431f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
433f1af5d2fSBarry Smith     idx += 7;
434f1af5d2fSBarry Smith   }
435f1af5d2fSBarry Smith   /* backward solve the L^T */
436f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
437f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
438f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
439f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
440f1af5d2fSBarry Smith     idt  = 7*i;
441f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
443f1af5d2fSBarry Smith     while (nz--) {
444f1af5d2fSBarry Smith       idx   = 7*(*vi--);
445f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452f1af5d2fSBarry Smith       v -= 49;
453f1af5d2fSBarry Smith     }
454f1af5d2fSBarry Smith   }
4551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
458f1af5d2fSBarry Smith   PetscFunctionReturn(0);
459f1af5d2fSBarry Smith }
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4624a2ae208SSatish Balay #undef __FUNCT__
4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465f1af5d2fSBarry Smith {
466f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4686849ba73SBarry Smith   PetscErrorCode ierr;
4695d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4705d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471690b6cddSBarry Smith   PetscInt       *diag = a->diag;
472f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47387828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   PetscFunctionBegin;
4761ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478f1af5d2fSBarry Smith   t  = a->solve_work;
479f1af5d2fSBarry Smith 
480f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482f1af5d2fSBarry Smith 
483f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
484f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
485f1af5d2fSBarry Smith     t[i] = b[c[i]];
486f1af5d2fSBarry Smith   }
487f1af5d2fSBarry Smith 
488f1af5d2fSBarry Smith   /* forward solve the U^T */
489f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
490f1af5d2fSBarry Smith 
491f1af5d2fSBarry Smith     v     = aa + diag[i];
492f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
493f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
494f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
495f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
496f1af5d2fSBarry Smith     while (nz--) {
497f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
498f1af5d2fSBarry Smith     }
499f1af5d2fSBarry Smith     t[i]   = s1;
500f1af5d2fSBarry Smith   }
501f1af5d2fSBarry Smith   /* backward solve the L^T */
502f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
503f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
504f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
505f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
506f1af5d2fSBarry Smith     s1   = t[i];
507f1af5d2fSBarry Smith     while (nz--) {
508f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
509f1af5d2fSBarry Smith     }
510f1af5d2fSBarry Smith   }
511f1af5d2fSBarry Smith 
512f1af5d2fSBarry Smith   /* copy t into x according to permutation */
513f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
514f1af5d2fSBarry Smith     x[r[i]]   = t[i];
515f1af5d2fSBarry Smith   }
516f1af5d2fSBarry Smith 
517f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5191ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528f1af5d2fSBarry Smith {
529f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5316849ba73SBarry Smith   PetscErrorCode ierr;
5325d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5335d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53687828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
538f1af5d2fSBarry Smith 
539f1af5d2fSBarry Smith   PetscFunctionBegin;
5401ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542f1af5d2fSBarry Smith   t  = a->solve_work;
543f1af5d2fSBarry Smith 
544f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546f1af5d2fSBarry Smith 
547f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
548f1af5d2fSBarry Smith   ii = 0;
549f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
550f1af5d2fSBarry Smith     ic      = 2*c[i];
551f1af5d2fSBarry Smith     t[ii]   = b[ic];
552f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
553f1af5d2fSBarry Smith     ii += 2;
554f1af5d2fSBarry Smith   }
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith   /* forward solve the U^T */
557f1af5d2fSBarry Smith   idx = 0;
558f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
559f1af5d2fSBarry Smith 
560f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
561f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
562f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
563f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
564f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
565f1af5d2fSBarry Smith     v += 4;
566f1af5d2fSBarry Smith 
567f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
568f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
569f1af5d2fSBarry Smith     while (nz--) {
570f1af5d2fSBarry Smith       oidx = 2*(*vi++);
571f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573f1af5d2fSBarry Smith       v  += 4;
574f1af5d2fSBarry Smith     }
575f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
576f1af5d2fSBarry Smith     idx += 2;
577f1af5d2fSBarry Smith   }
578f1af5d2fSBarry Smith   /* backward solve the L^T */
579f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
580f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
581f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
582f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
583f1af5d2fSBarry Smith     idt  = 2*i;
584f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
585f1af5d2fSBarry Smith     while (nz--) {
586f1af5d2fSBarry Smith       idx   = 2*(*vi--);
587f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589f1af5d2fSBarry Smith       v -= 4;
590f1af5d2fSBarry Smith     }
591f1af5d2fSBarry Smith   }
592f1af5d2fSBarry Smith 
593f1af5d2fSBarry Smith   /* copy t into x according to permutation */
594f1af5d2fSBarry Smith   ii = 0;
595f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
596f1af5d2fSBarry Smith     ir      = 2*r[i];
597f1af5d2fSBarry Smith     x[ir]   = t[ii];
598f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
599f1af5d2fSBarry Smith     ii += 2;
600f1af5d2fSBarry Smith   }
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
607f1af5d2fSBarry Smith   PetscFunctionReturn(0);
608f1af5d2fSBarry Smith }
609f1af5d2fSBarry Smith 
6104a2ae208SSatish Balay #undef __FUNCT__
6114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613f1af5d2fSBarry Smith {
614f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6166849ba73SBarry Smith   PetscErrorCode ierr;
6175d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6185d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62187828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
623f1af5d2fSBarry Smith 
624f1af5d2fSBarry Smith   PetscFunctionBegin;
6251ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627f1af5d2fSBarry Smith   t  = a->solve_work;
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631f1af5d2fSBarry Smith 
632f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
633f1af5d2fSBarry Smith   ii = 0;
634f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
635f1af5d2fSBarry Smith     ic      = 3*c[i];
636f1af5d2fSBarry Smith     t[ii]   = b[ic];
637f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
638f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
639f1af5d2fSBarry Smith     ii += 3;
640f1af5d2fSBarry Smith   }
641f1af5d2fSBarry Smith 
642f1af5d2fSBarry Smith   /* forward solve the U^T */
643f1af5d2fSBarry Smith   idx = 0;
644f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
645f1af5d2fSBarry Smith 
646f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
647f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
648f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652f1af5d2fSBarry Smith     v += 9;
653f1af5d2fSBarry Smith 
654f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
655f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
656f1af5d2fSBarry Smith     while (nz--) {
657f1af5d2fSBarry Smith       oidx = 3*(*vi++);
658f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661f1af5d2fSBarry Smith       v  += 9;
662f1af5d2fSBarry Smith     }
663f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664f1af5d2fSBarry Smith     idx += 3;
665f1af5d2fSBarry Smith   }
666f1af5d2fSBarry Smith   /* backward solve the L^T */
667f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
668f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
669f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
670f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
671f1af5d2fSBarry Smith     idt  = 3*i;
672f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673f1af5d2fSBarry Smith     while (nz--) {
674f1af5d2fSBarry Smith       idx   = 3*(*vi--);
675f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678f1af5d2fSBarry Smith       v -= 9;
679f1af5d2fSBarry Smith     }
680f1af5d2fSBarry Smith   }
681f1af5d2fSBarry Smith 
682f1af5d2fSBarry Smith   /* copy t into x according to permutation */
683f1af5d2fSBarry Smith   ii = 0;
684f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
685f1af5d2fSBarry Smith     ir      = 3*r[i];
686f1af5d2fSBarry Smith     x[ir]   = t[ii];
687f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
688f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
689f1af5d2fSBarry Smith     ii += 3;
690f1af5d2fSBarry Smith   }
691f1af5d2fSBarry Smith 
692f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
697f1af5d2fSBarry Smith   PetscFunctionReturn(0);
698f1af5d2fSBarry Smith }
699f1af5d2fSBarry Smith 
7004a2ae208SSatish Balay #undef __FUNCT__
7014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703f1af5d2fSBarry Smith {
704f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7066849ba73SBarry Smith   PetscErrorCode ierr;
7075d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7085d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
713f1af5d2fSBarry Smith 
714f1af5d2fSBarry Smith   PetscFunctionBegin;
7151ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717f1af5d2fSBarry Smith   t  = a->solve_work;
718f1af5d2fSBarry Smith 
719f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
723f1af5d2fSBarry Smith   ii = 0;
724f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
725f1af5d2fSBarry Smith     ic      = 4*c[i];
726f1af5d2fSBarry Smith     t[ii]   = b[ic];
727f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
728f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
729f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
730f1af5d2fSBarry Smith     ii += 4;
731f1af5d2fSBarry Smith   }
732f1af5d2fSBarry Smith 
733f1af5d2fSBarry Smith   /* forward solve the U^T */
734f1af5d2fSBarry Smith   idx = 0;
735f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
736f1af5d2fSBarry Smith 
737f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
738f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
739f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744f1af5d2fSBarry Smith     v += 16;
745f1af5d2fSBarry Smith 
746f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
747f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
748f1af5d2fSBarry Smith     while (nz--) {
749f1af5d2fSBarry Smith       oidx = 4*(*vi++);
750f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754f1af5d2fSBarry Smith       v  += 16;
755f1af5d2fSBarry Smith     }
756f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757f1af5d2fSBarry Smith     idx += 4;
758f1af5d2fSBarry Smith   }
759f1af5d2fSBarry Smith   /* backward solve the L^T */
760f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
761f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
762f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
763f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
764f1af5d2fSBarry Smith     idt  = 4*i;
765f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766f1af5d2fSBarry Smith     while (nz--) {
767f1af5d2fSBarry Smith       idx   = 4*(*vi--);
768f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772f1af5d2fSBarry Smith       v -= 16;
773f1af5d2fSBarry Smith     }
774f1af5d2fSBarry Smith   }
775f1af5d2fSBarry Smith 
776f1af5d2fSBarry Smith   /* copy t into x according to permutation */
777f1af5d2fSBarry Smith   ii = 0;
778f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
779f1af5d2fSBarry Smith     ir      = 4*r[i];
780f1af5d2fSBarry Smith     x[ir]   = t[ii];
781f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
782f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
783f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
784f1af5d2fSBarry Smith     ii += 4;
785f1af5d2fSBarry Smith   }
786f1af5d2fSBarry Smith 
787f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792f1af5d2fSBarry Smith   PetscFunctionReturn(0);
793f1af5d2fSBarry Smith }
794f1af5d2fSBarry Smith 
7954a2ae208SSatish Balay #undef __FUNCT__
7964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798f1af5d2fSBarry Smith {
799f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8016849ba73SBarry Smith   PetscErrorCode ierr;
8025d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
808f1af5d2fSBarry Smith 
809f1af5d2fSBarry Smith   PetscFunctionBegin;
8101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812f1af5d2fSBarry Smith   t  = a->solve_work;
813f1af5d2fSBarry Smith 
814f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816f1af5d2fSBarry Smith 
817f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
818f1af5d2fSBarry Smith   ii = 0;
819f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
820f1af5d2fSBarry Smith     ic      = 5*c[i];
821f1af5d2fSBarry Smith     t[ii]   = b[ic];
822f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
823f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
824f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
825f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
826f1af5d2fSBarry Smith     ii += 5;
827f1af5d2fSBarry Smith   }
828f1af5d2fSBarry Smith 
829f1af5d2fSBarry Smith   /* forward solve the U^T */
830f1af5d2fSBarry Smith   idx = 0;
831f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
832f1af5d2fSBarry Smith 
833f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
834f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
835f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841f1af5d2fSBarry Smith     v += 25;
842f1af5d2fSBarry Smith 
843f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
844f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
845f1af5d2fSBarry Smith     while (nz--) {
846f1af5d2fSBarry Smith       oidx = 5*(*vi++);
847f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852f1af5d2fSBarry Smith       v  += 25;
853f1af5d2fSBarry Smith     }
854f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855f1af5d2fSBarry Smith     idx += 5;
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith   /* backward solve the L^T */
858f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
859f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
860f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
861f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
862f1af5d2fSBarry Smith     idt  = 5*i;
863f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864f1af5d2fSBarry Smith     while (nz--) {
865f1af5d2fSBarry Smith       idx   = 5*(*vi--);
866f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871f1af5d2fSBarry Smith       v -= 25;
872f1af5d2fSBarry Smith     }
873f1af5d2fSBarry Smith   }
874f1af5d2fSBarry Smith 
875f1af5d2fSBarry Smith   /* copy t into x according to permutation */
876f1af5d2fSBarry Smith   ii = 0;
877f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
878f1af5d2fSBarry Smith     ir      = 5*r[i];
879f1af5d2fSBarry Smith     x[ir]   = t[ii];
880f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
881f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
882f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
883f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
884f1af5d2fSBarry Smith     ii += 5;
885f1af5d2fSBarry Smith   }
886f1af5d2fSBarry Smith 
887f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   PetscFunctionReturn(0);
893f1af5d2fSBarry Smith }
894f1af5d2fSBarry Smith 
8954a2ae208SSatish Balay #undef __FUNCT__
8964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898f1af5d2fSBarry Smith {
899f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9016849ba73SBarry Smith   PetscErrorCode ierr;
9025d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
908f1af5d2fSBarry Smith 
909f1af5d2fSBarry Smith   PetscFunctionBegin;
9101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912f1af5d2fSBarry Smith   t  = a->solve_work;
913f1af5d2fSBarry Smith 
914f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916f1af5d2fSBarry Smith 
917f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
918f1af5d2fSBarry Smith   ii = 0;
919f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
920f1af5d2fSBarry Smith     ic      = 6*c[i];
921f1af5d2fSBarry Smith     t[ii]   = b[ic];
922f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
923f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
924f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
925f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
926f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
927f1af5d2fSBarry Smith     ii += 6;
928f1af5d2fSBarry Smith   }
929f1af5d2fSBarry Smith 
930f1af5d2fSBarry Smith   /* forward solve the U^T */
931f1af5d2fSBarry Smith   idx = 0;
932f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
933f1af5d2fSBarry Smith 
934f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
935f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
936f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937f1af5d2fSBarry Smith     x6    = t[5+idx];
938f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944f1af5d2fSBarry Smith     v += 36;
945f1af5d2fSBarry Smith 
946f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
947f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
948f1af5d2fSBarry Smith     while (nz--) {
949f1af5d2fSBarry Smith       oidx = 6*(*vi++);
950f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956f1af5d2fSBarry Smith       v  += 36;
957f1af5d2fSBarry Smith     }
958f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959f1af5d2fSBarry Smith     t[5+idx] = s6;
960f1af5d2fSBarry Smith     idx += 6;
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith   /* backward solve the L^T */
963f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
964f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
965f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
966f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
967f1af5d2fSBarry Smith     idt  = 6*i;
968f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969f1af5d2fSBarry Smith     s6 = t[5+idt];
970f1af5d2fSBarry Smith     while (nz--) {
971f1af5d2fSBarry Smith       idx   = 6*(*vi--);
972f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978f1af5d2fSBarry Smith       v -= 36;
979f1af5d2fSBarry Smith     }
980f1af5d2fSBarry Smith   }
981f1af5d2fSBarry Smith 
982f1af5d2fSBarry Smith   /* copy t into x according to permutation */
983f1af5d2fSBarry Smith   ii = 0;
984f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
985f1af5d2fSBarry Smith     ir      = 6*r[i];
986f1af5d2fSBarry Smith     x[ir]   = t[ii];
987f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
988f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
989f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
990f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
991f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
992f1af5d2fSBarry Smith     ii += 6;
993f1af5d2fSBarry Smith   }
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9971ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1000f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1001f1af5d2fSBarry Smith }
1002f1af5d2fSBarry Smith 
10034a2ae208SSatish Balay #undef __FUNCT__
10044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006f1af5d2fSBarry Smith {
1007f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10096849ba73SBarry Smith   PetscErrorCode ierr;
10105d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10115d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1016f1af5d2fSBarry Smith 
1017f1af5d2fSBarry Smith   PetscFunctionBegin;
10181ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020f1af5d2fSBarry Smith   t  = a->solve_work;
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024f1af5d2fSBarry Smith 
1025f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1026f1af5d2fSBarry Smith   ii = 0;
1027f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1028f1af5d2fSBarry Smith     ic      = 7*c[i];
1029f1af5d2fSBarry Smith     t[ii]   = b[ic];
1030f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1031f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1032f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1033f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1034f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1035f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1036f1af5d2fSBarry Smith     ii += 7;
1037f1af5d2fSBarry Smith   }
1038f1af5d2fSBarry Smith 
1039f1af5d2fSBarry Smith   /* forward solve the U^T */
1040f1af5d2fSBarry Smith   idx = 0;
1041f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1044f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1045f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1047f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054f1af5d2fSBarry Smith     v += 49;
1055f1af5d2fSBarry Smith 
1056f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1057f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1058f1af5d2fSBarry Smith     while (nz--) {
1059f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1060f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067f1af5d2fSBarry Smith       v  += 49;
1068f1af5d2fSBarry Smith     }
1069f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1071f1af5d2fSBarry Smith     idx += 7;
1072f1af5d2fSBarry Smith   }
1073f1af5d2fSBarry Smith   /* backward solve the L^T */
1074f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1075f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1076f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1077f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1078f1af5d2fSBarry Smith     idt  = 7*i;
1079f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1081f1af5d2fSBarry Smith     while (nz--) {
1082f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1083f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090f1af5d2fSBarry Smith       v -= 49;
1091f1af5d2fSBarry Smith     }
1092f1af5d2fSBarry Smith   }
1093f1af5d2fSBarry Smith 
1094f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1095f1af5d2fSBarry Smith   ii = 0;
1096f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1097f1af5d2fSBarry Smith     ir      = 7*r[i];
1098f1af5d2fSBarry Smith     x[ir]   = t[ii];
1099f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1100f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1101f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1102f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1103f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1104f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1105f1af5d2fSBarry Smith     ii += 7;
1106f1af5d2fSBarry Smith   }
1107f1af5d2fSBarry Smith 
1108f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1113f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1114f1af5d2fSBarry Smith }
1115f1af5d2fSBarry Smith 
11164e2b4712SSatish Balay /* ----------------------------------------------------------- */
11174a2ae208SSatish Balay #undef __FUNCT__
11184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11204e2b4712SSatish Balay {
11214e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11224e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11236849ba73SBarry Smith   PetscErrorCode ierr;
11245d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11255d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11265d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11273f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
112887828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11294e2b4712SSatish Balay 
11304e2b4712SSatish Balay   PetscFunctionBegin;
11311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133f1af5d2fSBarry Smith   t  = a->solve_work;
11344e2b4712SSatish Balay 
11354e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11364e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11374e2b4712SSatish Balay 
11384e2b4712SSatish Balay   /* forward solve the lower triangular */
113987828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11404e2b4712SSatish Balay   for (i=1; i<n; i++) {
11414e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11424e2b4712SSatish Balay     vi  = aj + ai[i];
11434e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1144f1af5d2fSBarry Smith     s = t + bs*i;
114587828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11464e2b4712SSatish Balay     while (nz--) {
1147f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11484e2b4712SSatish Balay       v += bs2;
11494e2b4712SSatish Balay     }
11504e2b4712SSatish Balay   }
11514e2b4712SSatish Balay   /* backward solve the upper triangular */
1152d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11544e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11554e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11564e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115787828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11584e2b4712SSatish Balay     while (nz--) {
1159f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11604e2b4712SSatish Balay       v += bs2;
11614e2b4712SSatish Balay     }
1162f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116387828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11644e2b4712SSatish Balay   }
11654e2b4712SSatish Balay 
11664e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11674e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11714e2b4712SSatish Balay   PetscFunctionReturn(0);
11724e2b4712SSatish Balay }
11734e2b4712SSatish Balay 
11744a2ae208SSatish Balay #undef __FUNCT__
11754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11774e2b4712SSatish Balay {
11784e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11794e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11806849ba73SBarry Smith   PetscErrorCode ierr;
11815d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11825d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11833f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11864e2b4712SSatish Balay 
11874e2b4712SSatish Balay   PetscFunctionBegin;
11881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190f1af5d2fSBarry Smith   t  = a->solve_work;
11914e2b4712SSatish Balay 
11924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11944e2b4712SSatish Balay 
11954e2b4712SSatish Balay   /* forward solve the lower triangular */
11964e2b4712SSatish Balay   idx    = 7*(*r++);
1197f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1198f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12004e2b4712SSatish Balay 
12014e2b4712SSatish Balay   for (i=1; i<n; i++) {
12024e2b4712SSatish Balay     v     = aa + 49*ai[i];
12034e2b4712SSatish Balay     vi    = aj + ai[i];
12044e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12054e2b4712SSatish Balay     idx   = 7*(*r++);
1206f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12084e2b4712SSatish Balay     while (nz--) {
12094e2b4712SSatish Balay       idx   = 7*(*vi++);
1210f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1212f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1213f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12204e2b4712SSatish Balay       v += 49;
12214e2b4712SSatish Balay     }
12224e2b4712SSatish Balay     idx = 7*i;
1223f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1224f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12264e2b4712SSatish Balay   }
12274e2b4712SSatish Balay   /* backward solve the upper triangular */
12284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12294e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12304e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12314e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12324e2b4712SSatish Balay     idt  = 7*i;
1233f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1234f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12364e2b4712SSatish Balay     while (nz--) {
12374e2b4712SSatish Balay       idx   = 7*(*vi++);
1238f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1239f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1241f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12484e2b4712SSatish Balay       v += 49;
12494e2b4712SSatish Balay     }
12504e2b4712SSatish Balay     idc = 7*(*c--);
12514e2b4712SSatish Balay     v   = aa + 49*diag[i];
1252f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12664e2b4712SSatish Balay   }
12674e2b4712SSatish Balay 
12684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12734e2b4712SSatish Balay   PetscFunctionReturn(0);
12744e2b4712SSatish Balay }
12754e2b4712SSatish Balay 
12764a2ae208SSatish Balay #undef __FUNCT__
12774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
127915091d37SBarry Smith {
128015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282dfbe8321SBarry Smith   PetscErrorCode    ierr;
1283690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1284d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1285d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286d9fead3dSBarry Smith   const PetscScalar *b;
128715091d37SBarry Smith 
128815091d37SBarry Smith   PetscFunctionBegin;
1289d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129115091d37SBarry Smith   /* forward solve the lower triangular */
129215091d37SBarry Smith   idx    = 0;
129315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
129415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
129515091d37SBarry Smith   x[6] = b[6+idx];
129615091d37SBarry Smith   for (i=1; i<n; i++) {
129715091d37SBarry Smith     v     =  aa + 49*ai[i];
129815091d37SBarry Smith     vi    =  aj + ai[i];
129915091d37SBarry Smith     nz    =  diag[i] - ai[i];
130015091d37SBarry Smith     idx   =  7*i;
1301f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303f1af5d2fSBarry Smith     s7  =  b[6+idx];
130415091d37SBarry Smith     while (nz--) {
130515091d37SBarry Smith       jdx   = 7*(*vi++);
130615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
130715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
130815091d37SBarry Smith       x7    = x[6+jdx];
1309f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131615091d37SBarry Smith       v += 49;
131715091d37SBarry Smith      }
1318f1af5d2fSBarry Smith     x[idx]   = s1;
1319f1af5d2fSBarry Smith     x[1+idx] = s2;
1320f1af5d2fSBarry Smith     x[2+idx] = s3;
1321f1af5d2fSBarry Smith     x[3+idx] = s4;
1322f1af5d2fSBarry Smith     x[4+idx] = s5;
1323f1af5d2fSBarry Smith     x[5+idx] = s6;
1324f1af5d2fSBarry Smith     x[6+idx] = s7;
132515091d37SBarry Smith   }
132615091d37SBarry Smith   /* backward solve the upper triangular */
132715091d37SBarry Smith   for (i=n-1; i>=0; i--){
132815091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
132915091d37SBarry Smith     vi   = aj + diag[i] + 1;
133015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
133115091d37SBarry Smith     idt  = 7*i;
1332f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1333f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1334f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1335f1af5d2fSBarry Smith     s7 = x[6+idt];
133615091d37SBarry Smith     while (nz--) {
133715091d37SBarry Smith       idx   = 7*(*vi++);
133815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
133915091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
134015091d37SBarry Smith       x7    = x[6+idx];
1341f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134815091d37SBarry Smith       v += 49;
134915091d37SBarry Smith     }
135015091d37SBarry Smith     v        = aa + 49*diag[i];
1351f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
136515091d37SBarry Smith   }
136615091d37SBarry Smith 
1367d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
137015091d37SBarry Smith   PetscFunctionReturn(0);
137115091d37SBarry Smith }
137215091d37SBarry Smith 
13734a2ae208SSatish Balay #undef __FUNCT__
1374*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
1375*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1376*cee9d6f2SShri Abhyankar {
1377*cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1378*cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1379*cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1380*cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1381*cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1382*cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1383*cee9d6f2SShri Abhyankar     PetscScalar       *x;
1384*cee9d6f2SShri Abhyankar     const PetscScalar *b;
1385*cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1386*cee9d6f2SShri Abhyankar 
1387*cee9d6f2SShri Abhyankar     PetscFunctionBegin;
1388*cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1389*cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1390*cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
1391*cee9d6f2SShri Abhyankar     idx    = 0;
1392*cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1393*cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
1394*cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
1395*cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1396*cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1397*cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1398*cee9d6f2SShri Abhyankar       idx   = bs*i;
1399*cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1400*cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
1401*cee9d6f2SShri Abhyankar        while (nz--) {
1402*cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
1403*cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1404*cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
1405*cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1406*cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1407*cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1408*cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1409*cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1410*cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1411*cee9d6f2SShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1412*cee9d6f2SShri Abhyankar           v   +=  bs2;
1413*cee9d6f2SShri Abhyankar         }
1414*cee9d6f2SShri Abhyankar 
1415*cee9d6f2SShri Abhyankar        x[idx]   = s1;
1416*cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1417*cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1418*cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1419*cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1420*cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1421*cee9d6f2SShri Abhyankar        x[6+idx] = s7;
1422*cee9d6f2SShri Abhyankar     }
1423*cee9d6f2SShri Abhyankar 
1424*cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1425*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
1426*cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1427*cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1428*cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1429*cee9d6f2SShri Abhyankar      idt = bs*i;
1430*cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1431*cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
1432*cee9d6f2SShri Abhyankar     while (nz--) {
1433*cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
1434*cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1435*cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
1436*cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
1437*cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
1438*cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
1439*cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
1440*cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
1441*cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
1442*cee9d6f2SShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
1443*cee9d6f2SShri Abhyankar         v   +=  bs2;
1444*cee9d6f2SShri Abhyankar     }
1445*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1446*cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
1447*cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
1448*cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
1449*cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
1450*cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
1451*cee9d6f2SShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
1452*cee9d6f2SShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
1453*cee9d6f2SShri Abhyankar   }
1454*cee9d6f2SShri Abhyankar 
1455*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1456*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1457*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1458*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1459*cee9d6f2SShri Abhyankar }
1460*cee9d6f2SShri Abhyankar 
1461*cee9d6f2SShri Abhyankar #undef __FUNCT__
14624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1463dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
146415091d37SBarry Smith {
146515091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
146615091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
14676849ba73SBarry Smith   PetscErrorCode    ierr;
14685d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
14695d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1470d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1471d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1472d9fead3dSBarry Smith   const PetscScalar *b;
147315091d37SBarry Smith   PetscFunctionBegin;
1474d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14751ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1476f1af5d2fSBarry Smith   t  = a->solve_work;
147715091d37SBarry Smith 
147815091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
147915091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
148015091d37SBarry Smith 
148115091d37SBarry Smith   /* forward solve the lower triangular */
148215091d37SBarry Smith   idx    = 6*(*r++);
1483f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1484f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1485f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
148615091d37SBarry Smith   for (i=1; i<n; i++) {
148715091d37SBarry Smith     v     = aa + 36*ai[i];
148815091d37SBarry Smith     vi    = aj + ai[i];
148915091d37SBarry Smith     nz    = diag[i] - ai[i];
149015091d37SBarry Smith     idx   = 6*(*r++);
1491f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1492f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
149315091d37SBarry Smith     while (nz--) {
149415091d37SBarry Smith       idx   = 6*(*vi++);
1495f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1496f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150315091d37SBarry Smith       v += 36;
150415091d37SBarry Smith     }
150515091d37SBarry Smith     idx = 6*i;
1506f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1507f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1508f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
150915091d37SBarry Smith   }
151015091d37SBarry Smith   /* backward solve the upper triangular */
151115091d37SBarry Smith   for (i=n-1; i>=0; i--){
151215091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
151315091d37SBarry Smith     vi   = aj + diag[i] + 1;
151415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151515091d37SBarry Smith     idt  = 6*i;
1516f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1517f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1518f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
151915091d37SBarry Smith     while (nz--) {
152015091d37SBarry Smith       idx   = 6*(*vi++);
1521f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1522f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1523f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1524f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1525f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1526f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1527f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1528f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1529f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
153015091d37SBarry Smith       v += 36;
153115091d37SBarry Smith     }
153215091d37SBarry Smith     idc = 6*(*c--);
153315091d37SBarry Smith     v   = aa + 36*diag[i];
1534f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1535f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1536f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1537f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1538f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1539f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1540f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1541f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1542f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1543f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1544f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1545f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
154615091d37SBarry Smith   }
154715091d37SBarry Smith 
154815091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
154915091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1550d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1552dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
155315091d37SBarry Smith   PetscFunctionReturn(0);
155415091d37SBarry Smith }
155515091d37SBarry Smith 
15564a2ae208SSatish Balay #undef __FUNCT__
15574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering - Forward and backward block-triangular
   sweeps over the data of a SeqBAIJ matrix with 6x6 blocks, with no row or
   column permutation (block i of bb maps straight to block i of xx).

   Input:
     A  - matrix; a->a holds the blocks (36 MatScalars each), a->i the block-row
          offsets, a->j the block-column indices, a->diag the offset of the
          diagonal block of every block row
     bb - right-hand side; its array is only read
   Output:
     xx - result; its array doubles as the work array for both sweeps

   Entry (r,c) of a block is read from v[r + 6*c].

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1558dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
155915091d37SBarry Smith {
156015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1561690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1562dfbe8321SBarry Smith   PetscErrorCode    ierr;
1563690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1564d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1565d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1566d9fead3dSBarry Smith   const PetscScalar *b;
156715091d37SBarry Smith 
156815091d37SBarry Smith   PetscFunctionBegin;
  /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1569d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
157115091d37SBarry Smith   /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate, so its six entries are copied unchanged */
  /* NOTE(review): this copy runs before the i<n test, so it touches x[0..5] and b[0..5] even when n is 0 - confirm callers never pass an empty matrix */
157215091d37SBarry Smith   idx    = 0;
157315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
157415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
157515091d37SBarry Smith   for (i=1; i<n; i++) {
    /* blocks ai[i] .. diag[i]-1 of row i are the ones in front of the diagonal block */
157615091d37SBarry Smith     v     =  aa + 36*ai[i];
157715091d37SBarry Smith     vi    =  aj + ai[i];
157815091d37SBarry Smith     nz    =  diag[i] - ai[i];
157915091d37SBarry Smith     idx   =  6*i;
1580f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1581f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
158215091d37SBarry Smith     while (nz--) {
      /* s -= block * x(block column *vi); x there was produced by an earlier iteration */
158315091d37SBarry Smith       jdx   = 6*(*vi++);
158415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
158515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1586f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1587f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1588f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1589f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1590f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1591f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
159215091d37SBarry Smith       v += 36;
159315091d37SBarry Smith      }
    /* no diagonal block is applied in the forward sweep */
1594f1af5d2fSBarry Smith     x[idx]   = s1;
1595f1af5d2fSBarry Smith     x[1+idx] = s2;
1596f1af5d2fSBarry Smith     x[2+idx] = s3;
1597f1af5d2fSBarry Smith     x[3+idx] = s4;
1598f1af5d2fSBarry Smith     x[4+idx] = s5;
1599f1af5d2fSBarry Smith     x[5+idx] = s6;
160015091d37SBarry Smith   }
160115091d37SBarry Smith   /* backward solve the upper triangular */
160215091d37SBarry Smith   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal; blocks diag[i]+1 .. ai[i+1]-1 are eliminated */
160315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
160415091d37SBarry Smith     vi   = aj + diag[i] + 1;
160515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
160615091d37SBarry Smith     idt  = 6*i;
1607f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1608f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1609f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
161015091d37SBarry Smith     while (nz--) {
161115091d37SBarry Smith       idx   = 6*(*vi++);
161215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
161315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1614f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1615f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1616f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1617f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1618f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1619f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
162015091d37SBarry Smith       v += 36;
162115091d37SBarry Smith     }
    /* the block at diag[i] is multiplied in, not solved with; presumably it already holds the inverted diagonal block - confirm against the factorization routine */
162215091d37SBarry Smith     v        = aa + 36*diag[i];
1623f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1624f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1625f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1626f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1627f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1628f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
162915091d37SBarry Smith   }
163015091d37SBarry Smith 
1631d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
163415091d37SBarry Smith   PetscFunctionReturn(0);
163515091d37SBarry Smith }
163615091d37SBarry Smith 
16374a2ae208SSatish Balay #undef __FUNCT__
1638*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct - Forward and backward
   block-triangular sweeps for 6x6 blocks with no permutation. Unlike
   MatSolve_SeqBAIJ_6_NaturalOrdering it never reads a->diag.

   Indexing used by this routine:
     forward,  row i : blocks ai[i] .. ai[i+1]-1 (every block in that range)
     backward, row i : blocks ai[2*n-i] .. ai[2*n-i+1]-2 are eliminated, and
                       the last block of that range is then multiplied in as
                       the inverse diagonal

   Input:
     A  - matrix supplying a->a, a->i, a->j, a->mbs, a->bs2 and A->rmap->bs
     bb - right-hand side; its array is only read
   Output:
     xx - result; its array doubles as the work array for both sweeps

   NOTE(review): bs and bs2 are read at run time, but the unrolled arithmetic
   hard-codes six unknowns per block and offsets v[0..35], so the routine is
   only meaningful when bs == 6 and bs2 == 36.

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1639*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1640*cee9d6f2SShri Abhyankar {
1641*cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1642*cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
1643*cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
1644*cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
1645*cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
1646*cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
1647*cee9d6f2SShri Abhyankar     PetscScalar       *x;
1648*cee9d6f2SShri Abhyankar     const PetscScalar *b;
1649*cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1650*cee9d6f2SShri Abhyankar 
1651*cee9d6f2SShri Abhyankar     PetscFunctionBegin;
    /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1652*cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1653*cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654*cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 is copied unchanged */
    /* NOTE(review): this copy is not guarded by n, so it touches x[0..5] and b[0..5] even when n is 0 - confirm callers never pass an empty matrix */
1655*cee9d6f2SShri Abhyankar     idx    = 0;
1656*cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
1657*cee9d6f2SShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
1658*cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
       /* every block between ai[i] and ai[i+1] is eliminated; no diagonal offset is consulted */
1659*cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
1660*cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
1661*cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
1662*cee9d6f2SShri Abhyankar       idx   = bs*i;
1663*cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1664*cee9d6f2SShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
1665*cee9d6f2SShri Abhyankar        while (nz--) {
          /* s -= block * x(block column *vi); entry (r,c) of the block is v[r + 6*c] */
          /* the doubled ';' on the s2 line is only an empty statement */
1666*cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
1667*cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
1668*cee9d6f2SShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
1669*cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1670*cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1671*cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1672*cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1673*cee9d6f2SShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1674*cee9d6f2SShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1675*cee9d6f2SShri Abhyankar           v   +=  bs2;
1676*cee9d6f2SShri Abhyankar         }
1677*cee9d6f2SShri Abhyankar 
1678*cee9d6f2SShri Abhyankar        x[idx]   = s1;
1679*cee9d6f2SShri Abhyankar        x[1+idx] = s2;
1680*cee9d6f2SShri Abhyankar        x[2+idx] = s3;
1681*cee9d6f2SShri Abhyankar        x[3+idx] = s4;
1682*cee9d6f2SShri Abhyankar        x[4+idx] = s5;
1683*cee9d6f2SShri Abhyankar        x[5+idx] = s6;
1684*cee9d6f2SShri Abhyankar     }
1685*cee9d6f2SShri Abhyankar 
1686*cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
1687*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
     /* the blocks for row i start at ai[2*n-i]; nz excludes the last block of the range */
1688*cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
1689*cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
1690*cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1691*cee9d6f2SShri Abhyankar      idt = bs*i;
1692*cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
1693*cee9d6f2SShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
1694*cee9d6f2SShri Abhyankar     while (nz--) {
1695*cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
1696*cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
1697*cee9d6f2SShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
1698*cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
1699*cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
1700*cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
1701*cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
1702*cee9d6f2SShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
1703*cee9d6f2SShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
1704*cee9d6f2SShri Abhyankar         v   +=  bs2;
1705*cee9d6f2SShri Abhyankar     }
    /* v is not reset here: after the loop it already points at the last block of the range */
1706*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1707*cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1708*cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1709*cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1710*cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1711*cee9d6f2SShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1712*cee9d6f2SShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
1713*cee9d6f2SShri Abhyankar   }
1714*cee9d6f2SShri Abhyankar 
1715*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1716*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
  /* NOTE(review): bs is a PetscInt, so bs*A->cmap->n is presumably formed in integer arithmetic before the subtraction, unlike the 6.0*A->cmap->n form in MatSolve_SeqBAIJ_6_NaturalOrdering - confirm this cannot overflow */
1717*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1718*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1719*cee9d6f2SShri Abhyankar }
1720*cee9d6f2SShri Abhyankar #undef __FUNCT__
17214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
/*
   MatSolve_SeqBAIJ_5 - Forward and backward block-triangular sweeps for 5x5
   blocks, with the row index set a->row applied when reading bb and the
   column index set a->col applied when writing xx.

   Input:
     A  - matrix; a->a holds the blocks (25 MatScalars each), a->i the block-row
          offsets, a->j the block-column indices, a->diag the offset of the
          diagonal block of every block row; a->solve_work is the work array
     bb - right-hand side; its array is only read
   Output:
     xx - result, written block by block during the backward sweep

   Entry (r,c) of a block is read from v[r + 5*c].

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1722dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
17234e2b4712SSatish Balay {
17244e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17254e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17266849ba73SBarry Smith   PetscErrorCode    ierr;
17275d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
17285d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1729d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1730d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1731d9fead3dSBarry Smith   const PetscScalar *b;
17324e2b4712SSatish Balay 
17334e2b4712SSatish Balay   PetscFunctionBegin;
  /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1734d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1736f1af5d2fSBarry Smith   t  = a->solve_work;
17374e2b4712SSatish Balay 
  /* r walks the row indices forward from the first entry; c walks the column indices backward from the last */
17384e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17394e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17404e2b4712SSatish Balay 
17414e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0: gather the permuted right-hand-side block into t unchanged */
  /* NOTE(review): *r is read and t[0..4] written before any test of n - confirm callers never pass an empty matrix */
17424e2b4712SSatish Balay   idx    = 5*(*r++);
1743f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1744f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
17454e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* blocks ai[i] .. diag[i]-1 of row i are the ones in front of the diagonal block */
17464e2b4712SSatish Balay     v     = aa + 25*ai[i];
17474e2b4712SSatish Balay     vi    = aj + ai[i];
17484e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17494e2b4712SSatish Balay     idx   = 5*(*r++);
1750f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1751f1af5d2fSBarry Smith     s5  = b[4+idx];
17524e2b4712SSatish Balay     while (nz--) {
      /* idx is reused: from here on it addresses t by block column, not b */
17534e2b4712SSatish Balay       idx   = 5*(*vi++);
1754f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1755f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1756f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1757f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1758f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1759f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1760f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
17614e2b4712SSatish Balay       v += 25;
17624e2b4712SSatish Balay     }
    /* the intermediate result is stored in t at the unpermuted block position i */
17634e2b4712SSatish Balay     idx = 5*i;
1764f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1765f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
17664e2b4712SSatish Balay   }
17674e2b4712SSatish Balay   /* backward solve the upper triangular */
17684e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal; blocks diag[i]+1 .. ai[i+1]-1 are eliminated */
17694e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
17704e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17714e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17724e2b4712SSatish Balay     idt  = 5*i;
1773f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1774f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
17754e2b4712SSatish Balay     while (nz--) {
17764e2b4712SSatish Balay       idx   = 5*(*vi++);
1777f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1778f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1779f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1780f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1781f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1782f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1783f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
17844e2b4712SSatish Balay       v += 25;
17854e2b4712SSatish Balay     }
    /* the block at diag[i] is multiplied in (presumably the inverted diagonal block - confirm); the result goes to t for later rows and to x at the column-permuted block */
17864e2b4712SSatish Balay     idc = 5*(*c--);
17874e2b4712SSatish Balay     v   = aa + 25*diag[i];
1788f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1789f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1790f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1791f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1792f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1793f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1794f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1795f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1796f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1797f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
17984e2b4712SSatish Balay   }
17994e2b4712SSatish Balay 
18004e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18014e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1802d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1804dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
18054e2b4712SSatish Balay   PetscFunctionReturn(0);
18064e2b4712SSatish Balay }
18074e2b4712SSatish Balay 
18084a2ae208SSatish Balay #undef __FUNCT__
18094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering - Forward and backward block-triangular
   sweeps for 5x5 blocks with no row or column permutation (block i of bb
   maps straight to block i of xx).

   Input:
     A  - matrix; a->a holds the blocks (25 MatScalars each), a->i the block-row
          offsets, a->j the block-column indices, a->diag the offset of the
          diagonal block of every block row
     bb - right-hand side; its array is only read
   Output:
     xx - result; its array doubles as the work array for both sweeps

   Entry (r,c) of a block is read from v[r + 5*c].

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1810dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
181115091d37SBarry Smith {
181215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1813690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1814dfbe8321SBarry Smith   PetscErrorCode    ierr;
1815690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1816d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1817d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1818d9fead3dSBarry Smith   const PetscScalar *b;
181915091d37SBarry Smith 
182015091d37SBarry Smith   PetscFunctionBegin;
  /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1821d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
182315091d37SBarry Smith   /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate, so its five entries are copied unchanged */
  /* NOTE(review): this copy runs before the i<n test, so it touches x[0..4] and b[0..4] even when n is 0 - confirm callers never pass an empty matrix */
182415091d37SBarry Smith   idx    = 0;
182515091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
182615091d37SBarry Smith   for (i=1; i<n; i++) {
    /* blocks ai[i] .. diag[i]-1 of row i are the ones in front of the diagonal block */
182715091d37SBarry Smith     v     =  aa + 25*ai[i];
182815091d37SBarry Smith     vi    =  aj + ai[i];
182915091d37SBarry Smith     nz    =  diag[i] - ai[i];
183015091d37SBarry Smith     idx   =  5*i;
1831f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
183215091d37SBarry Smith     while (nz--) {
      /* s -= block * x(block column *vi); x there was produced by an earlier iteration */
183315091d37SBarry Smith       jdx   = 5*(*vi++);
183415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1835f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1836f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1837f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1838f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1839f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
184015091d37SBarry Smith       v    += 25;
184115091d37SBarry Smith     }
    /* no diagonal block is applied in the forward sweep */
1842f1af5d2fSBarry Smith     x[idx]   = s1;
1843f1af5d2fSBarry Smith     x[1+idx] = s2;
1844f1af5d2fSBarry Smith     x[2+idx] = s3;
1845f1af5d2fSBarry Smith     x[3+idx] = s4;
1846f1af5d2fSBarry Smith     x[4+idx] = s5;
184715091d37SBarry Smith   }
184815091d37SBarry Smith   /* backward solve the upper triangular */
184915091d37SBarry Smith   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal; blocks diag[i]+1 .. ai[i+1]-1 are eliminated */
185015091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
185115091d37SBarry Smith     vi   = aj + diag[i] + 1;
185215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
185315091d37SBarry Smith     idt  = 5*i;
1854f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1855f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
185615091d37SBarry Smith     while (nz--) {
185715091d37SBarry Smith       idx   = 5*(*vi++);
185815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1859f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1860f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1861f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1862f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1863f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
186415091d37SBarry Smith       v    += 25;
186515091d37SBarry Smith     }
    /* the block at diag[i] is multiplied in, not solved with; presumably it already holds the inverted diagonal block - confirm against the factorization routine */
186615091d37SBarry Smith     v        = aa + 25*diag[i];
1867f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1868f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1869f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1870f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1871f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
187215091d37SBarry Smith   }
187315091d37SBarry Smith 
1874d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1876dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
187715091d37SBarry Smith   PetscFunctionReturn(0);
187815091d37SBarry Smith }
187915091d37SBarry Smith 
18804a2ae208SSatish Balay #undef __FUNCT__
1881*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct - Forward and backward
   block-triangular sweeps for 5x5 blocks with no permutation. Unlike
   MatSolve_SeqBAIJ_5_NaturalOrdering it never reads a->diag.

   Indexing used by this routine:
     forward,  row i : blocks ai[i] .. ai[i+1]-1 (every block in that range)
     backward, row i : blocks ai[2*n-i] .. ai[2*n-i+1]-2 are eliminated, and
                       the last block of that range is then multiplied in as
                       the inverse diagonal

   Input:
     A  - matrix supplying a->a (25 MatScalars per block), a->i, a->j, a->mbs
     bb - right-hand side; its array is only read
   Output:
     xx - result; its array doubles as the work array for both sweeps

   Entry (r,c) of a block is read from v[r + 5*c].

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1882*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1883*cee9d6f2SShri Abhyankar {
1884*cee9d6f2SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1885*cee9d6f2SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1886*cee9d6f2SShri Abhyankar   PetscErrorCode    ierr;
1887*cee9d6f2SShri Abhyankar   PetscInt          jdx;
1888*cee9d6f2SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1889*cee9d6f2SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1890*cee9d6f2SShri Abhyankar   const PetscScalar *b;
1891*cee9d6f2SShri Abhyankar 
1892*cee9d6f2SShri Abhyankar   PetscFunctionBegin;
  /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1893*cee9d6f2SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1894*cee9d6f2SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1895*cee9d6f2SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 is copied unchanged */
  /* NOTE(review): this copy is not guarded by n, so it touches x[0..4] and b[0..4] even when n is 0 - confirm callers never pass an empty matrix */
1896*cee9d6f2SShri Abhyankar   idx    = 0;
1897*cee9d6f2SShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1898*cee9d6f2SShri Abhyankar   for (i=1; i<n; i++) {
    /* every block between ai[i] and ai[i+1] is eliminated; no diagonal offset is consulted */
1899*cee9d6f2SShri Abhyankar     v   = aa + 25*ai[i];
1900*cee9d6f2SShri Abhyankar     vi  = aj + ai[i];
1901*cee9d6f2SShri Abhyankar     nz  = ai[i+1] - ai[i];
1902*cee9d6f2SShri Abhyankar     idx = 5*i;
1903*cee9d6f2SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1904*cee9d6f2SShri Abhyankar     while (nz--) {
      /* s -= block * x(block column *vi) */
1905*cee9d6f2SShri Abhyankar       jdx   = 5*(*vi++);
1906*cee9d6f2SShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1907*cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1908*cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1909*cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1910*cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1911*cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1912*cee9d6f2SShri Abhyankar       v    += 25;
1913*cee9d6f2SShri Abhyankar     }
1914*cee9d6f2SShri Abhyankar     x[idx]   = s1;
1915*cee9d6f2SShri Abhyankar     x[1+idx] = s2;
1916*cee9d6f2SShri Abhyankar     x[2+idx] = s3;
1917*cee9d6f2SShri Abhyankar     x[3+idx] = s4;
1918*cee9d6f2SShri Abhyankar     x[4+idx] = s5;
1919*cee9d6f2SShri Abhyankar   }
1920*cee9d6f2SShri Abhyankar 
1921*cee9d6f2SShri Abhyankar   /* backward solve the upper triangular */
1922*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* the blocks for row i start at ai[2*n-i]; nz excludes the last block of the range */
1923*cee9d6f2SShri Abhyankar     v   = aa + 25*ai[2*n-i];
1924*cee9d6f2SShri Abhyankar     vi  = aj + ai[2*n-i];
1925*cee9d6f2SShri Abhyankar     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1926*cee9d6f2SShri Abhyankar     idt = 5*i;
1927*cee9d6f2SShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
1928*cee9d6f2SShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1929*cee9d6f2SShri Abhyankar     while (nz--) {
1930*cee9d6f2SShri Abhyankar       idx   = 5*(*vi++);
1931*cee9d6f2SShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1932*cee9d6f2SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1933*cee9d6f2SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1934*cee9d6f2SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1935*cee9d6f2SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1936*cee9d6f2SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1937*cee9d6f2SShri Abhyankar       v    += 25;
1938*cee9d6f2SShri Abhyankar     }
    /* v is not reset here: after the loop it already points at the last block of the range */
1939*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
1940*cee9d6f2SShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1941*cee9d6f2SShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1942*cee9d6f2SShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1943*cee9d6f2SShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1944*cee9d6f2SShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1945*cee9d6f2SShri Abhyankar   }
1946*cee9d6f2SShri Abhyankar 
1947*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1948*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1949*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1950*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
1951*cee9d6f2SShri Abhyankar }
1952*cee9d6f2SShri Abhyankar 
1953*cee9d6f2SShri Abhyankar #undef __FUNCT__
19544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
/*
   MatSolve_SeqBAIJ_4 - Forward and backward block-triangular sweeps for 4x4
   blocks, with the row index set a->row applied when reading bb and the
   column index set a->col applied when writing xx.

   Input:
     A  - matrix; a->a holds the blocks (16 MatScalars each), a->i the block-row
          offsets, a->j the block-column indices, a->diag the offset of the
          diagonal block of every block row; a->solve_work is the work array
     bb - right-hand side; its array is only read
   Output:
     xx - result, written block by block during the backward sweep

   Entry (r,c) of a block is read from v[r + 4*c].

   Returns 0 on success; a failing PETSc call returns through CHKERRQ.
*/
1955dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
19564e2b4712SSatish Balay {
19574e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
19584e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
19596849ba73SBarry Smith   PetscErrorCode    ierr;
19605d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
19615d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1962d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1963d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1964d9fead3dSBarry Smith   const PetscScalar *b;
19654e2b4712SSatish Balay 
19664e2b4712SSatish Balay   PetscFunctionBegin;
  /* b is declared const and is only read below; the cast exists to fit the VecGetArray() argument */
1967d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19681ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1969f1af5d2fSBarry Smith   t  = a->solve_work;
19704e2b4712SSatish Balay 
  /* r walks the row indices forward from the first entry; c walks the column indices backward from the last */
19714e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
19724e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
19734e2b4712SSatish Balay 
19744e2b4712SSatish Balay   /* forward solve the lower triangular */
  /* block row 0: gather the permuted right-hand-side block into t unchanged */
  /* NOTE(review): *r is read and t[0..3] written before any test of n - confirm callers never pass an empty matrix */
19754e2b4712SSatish Balay   idx    = 4*(*r++);
1976f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1977f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
19784e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* blocks ai[i] .. diag[i]-1 of row i are the ones in front of the diagonal block */
19794e2b4712SSatish Balay     v     = aa + 16*ai[i];
19804e2b4712SSatish Balay     vi    = aj + ai[i];
19814e2b4712SSatish Balay     nz    = diag[i] - ai[i];
19824e2b4712SSatish Balay     idx   = 4*(*r++);
1983f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
19844e2b4712SSatish Balay     while (nz--) {
      /* idx is reused: from here on it addresses t by block column, not b */
19854e2b4712SSatish Balay       idx   = 4*(*vi++);
1986f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1987f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1988f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1989f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1990f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
19914e2b4712SSatish Balay       v    += 16;
19924e2b4712SSatish Balay     }
    /* the intermediate result is stored in t at the unpermuted block position i */
19934e2b4712SSatish Balay     idx        = 4*i;
1994f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1995f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
19964e2b4712SSatish Balay   }
19974e2b4712SSatish Balay   /* backward solve the upper triangular */
19984e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* start one block past the diagonal; blocks diag[i]+1 .. ai[i+1]-1 are eliminated */
19994e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
20004e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
20014e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
20024e2b4712SSatish Balay     idt  = 4*i;
2003f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2004f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
20054e2b4712SSatish Balay     while (nz--) {
20064e2b4712SSatish Balay       idx   = 4*(*vi++);
2007f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2008f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2009f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2010f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2011f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2012f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
20134e2b4712SSatish Balay       v += 16;
20144e2b4712SSatish Balay     }
    /* the block at diag[i] is multiplied in (presumably the inverted diagonal block - confirm); the result goes to t for later rows and to x at the column-permuted block */
20154e2b4712SSatish Balay     idc      = 4*(*c--);
20164e2b4712SSatish Balay     v        = aa + 16*diag[i];
2017f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2018f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2019f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2020f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
20214e2b4712SSatish Balay   }
20224e2b4712SSatish Balay 
20234e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
20244e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2025d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2027dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
20284e2b4712SSatish Balay   PetscFunctionReturn(0);
20294e2b4712SSatish Balay }
2030f26ec98cSKris Buschelman 
2031f26ec98cSKris Buschelman #undef __FUNCT__
2032f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2033dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2034f26ec98cSKris Buschelman {
2035f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2036f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
20376849ba73SBarry Smith   PetscErrorCode    ierr;
20385d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
20395d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2040d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2041d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2042d9fead3dSBarry Smith   PetscScalar       *x;
2043d9fead3dSBarry Smith   const PetscScalar *b;
2044f26ec98cSKris Buschelman 
2045f26ec98cSKris Buschelman   PetscFunctionBegin;
2046d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2048f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2049f26ec98cSKris Buschelman 
2050f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2051f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2052f26ec98cSKris Buschelman 
2053f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2054f26ec98cSKris Buschelman   idx    = 4*(*r++);
2055f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2056f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2057f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2058f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2059f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2060f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2061f26ec98cSKris Buschelman     vi    = aj + ai[i];
2062f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2063f26ec98cSKris Buschelman     idx   = 4*(*r++);
2064f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2065f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2066f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2067f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2068f26ec98cSKris Buschelman     while (nz--) {
2069f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2070f26ec98cSKris Buschelman       x1  = t[idx];
2071f26ec98cSKris Buschelman       x2  = t[1+idx];
2072f26ec98cSKris Buschelman       x3  = t[2+idx];
2073f26ec98cSKris Buschelman       x4  = t[3+idx];
2074f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2075f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2076f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2077f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2078f26ec98cSKris Buschelman       v    += 16;
2079f26ec98cSKris Buschelman     }
2080f26ec98cSKris Buschelman     idx        = 4*i;
2081f26ec98cSKris Buschelman     t[idx]   = s1;
2082f26ec98cSKris Buschelman     t[1+idx] = s2;
2083f26ec98cSKris Buschelman     t[2+idx] = s3;
2084f26ec98cSKris Buschelman     t[3+idx] = s4;
2085f26ec98cSKris Buschelman   }
2086f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2087f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2088f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2089f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2090f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2091f26ec98cSKris Buschelman     idt  = 4*i;
2092f26ec98cSKris Buschelman     s1 = t[idt];
2093f26ec98cSKris Buschelman     s2 = t[1+idt];
2094f26ec98cSKris Buschelman     s3 = t[2+idt];
2095f26ec98cSKris Buschelman     s4 = t[3+idt];
2096f26ec98cSKris Buschelman     while (nz--) {
2097f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2098f26ec98cSKris Buschelman       x1  = t[idx];
2099f26ec98cSKris Buschelman       x2  = t[1+idx];
2100f26ec98cSKris Buschelman       x3  = t[2+idx];
2101f26ec98cSKris Buschelman       x4  = t[3+idx];
2102f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2103f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2104f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2105f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
2106f26ec98cSKris Buschelman       v += 16;
2107f26ec98cSKris Buschelman     }
2108f26ec98cSKris Buschelman     idc      = 4*(*c--);
2109f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
2110f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2111f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2112f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2113f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
2114f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2115f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2116f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2117f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2118f26ec98cSKris Buschelman  }
2119f26ec98cSKris Buschelman 
2120f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2121f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2122d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21231ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2124dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2125f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2126f26ec98cSKris Buschelman }
2127f26ec98cSKris Buschelman 
212824c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
212924c233c2SKris Buschelman 
213024c233c2SKris Buschelman #include PETSC_HAVE_SSE
213124c233c2SKris Buschelman 
213224c233c2SKris Buschelman #undef __FUNCT__
213324c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2134dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
213524c233c2SKris Buschelman {
213624c233c2SKris Buschelman   /*
213724c233c2SKris Buschelman      Note: This code uses demotion of double
213824c233c2SKris Buschelman      to float when performing the mixed-mode computation.
213924c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
214024c233c2SKris Buschelman   */
214124c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
214224c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
21436849ba73SBarry Smith   PetscErrorCode ierr;
21445d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
21455d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
214624c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
214787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
214824c233c2SKris Buschelman 
214924c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
215024c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
215124c233c2SKris Buschelman   unsigned long   offset;
215224c233c2SKris Buschelman 
215324c233c2SKris Buschelman   PetscFunctionBegin;
215424c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
215524c233c2SKris Buschelman 
215624c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
215724c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
215824c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
215924c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
216024c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
216124c233c2SKris Buschelman 
21621ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
21631ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
216424c233c2SKris Buschelman     t  = a->solve_work;
216524c233c2SKris Buschelman 
216624c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
216724c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
216824c233c2SKris Buschelman 
216924c233c2SKris Buschelman     /* forward solve the lower triangular */
217024c233c2SKris Buschelman     idx  = 4*(*r++);
217124c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
217224c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
217324c233c2SKris Buschelman     v    =  aa + 16*ai[1];
217424c233c2SKris Buschelman 
217524c233c2SKris Buschelman     for (i=1; i<n;) {
217624c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
217724c233c2SKris Buschelman       vi   =  aj      + ai[i];
217824c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
217924c233c2SKris Buschelman       idx  =  4*(*r++);
218024c233c2SKris Buschelman 
218124c233c2SKris Buschelman       /* Demote sum from double to float */
218224c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
218324c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
218424c233c2SKris Buschelman 
218524c233c2SKris Buschelman       while (nz--) {
218624c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
218724c233c2SKris Buschelman         idx = 4*(*vi++);
218824c233c2SKris Buschelman 
218924c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
219024c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
219124c233c2SKris Buschelman 
219224c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
219324c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
219424c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
219524c233c2SKris Buschelman 
219624c233c2SKris Buschelman           /* First Column */
219724c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
219824c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
219924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
220024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
220124c233c2SKris Buschelman 
220224c233c2SKris Buschelman           /* Second Column */
220324c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
220424c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
220524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
220624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
220724c233c2SKris Buschelman 
220824c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
220924c233c2SKris Buschelman 
221024c233c2SKris Buschelman           /* Third Column */
221124c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
221224c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
221324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
221424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
221524c233c2SKris Buschelman 
221624c233c2SKris Buschelman           /* Fourth Column */
221724c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
221824c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
221924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
222024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
222124c233c2SKris Buschelman         SSE_INLINE_END_2
222224c233c2SKris Buschelman 
222324c233c2SKris Buschelman         v  += 16;
222424c233c2SKris Buschelman       }
222524c233c2SKris Buschelman       idx = 4*i;
222624c233c2SKris Buschelman       v   = aa + 16*ai[++i];
222724c233c2SKris Buschelman       PREFETCH_NTA(v);
222824c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
222924c233c2SKris Buschelman 
223024c233c2SKris Buschelman       /* Promote result from float to double */
223124c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
223224c233c2SKris Buschelman     }
223324c233c2SKris Buschelman     /* backward solve the upper triangular */
223424c233c2SKris Buschelman     idt  = 4*(n-1);
223524c233c2SKris Buschelman     ai16 = 16*diag[n-1];
223624c233c2SKris Buschelman     v    = aa + ai16 + 16;
223724c233c2SKris Buschelman     for (i=n-1; i>=0;){
223824c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
223924c233c2SKris Buschelman       vi = aj + diag[i] + 1;
224024c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
224124c233c2SKris Buschelman 
224224c233c2SKris Buschelman       /* Demote accumulator from double to float */
224324c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
224424c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
224524c233c2SKris Buschelman 
224624c233c2SKris Buschelman       while (nz--) {
224724c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
224824c233c2SKris Buschelman         idx = 4*(*vi++);
224924c233c2SKris Buschelman 
225024c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
225124c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
225224c233c2SKris Buschelman 
225324c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
225424c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
225524c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
225624c233c2SKris Buschelman 
225724c233c2SKris Buschelman           /* First Column */
225824c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
225924c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
226024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
226124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
226224c233c2SKris Buschelman 
226324c233c2SKris Buschelman           /* Second Column */
226424c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
226524c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
226624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
226724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
226824c233c2SKris Buschelman 
226924c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
227024c233c2SKris Buschelman 
227124c233c2SKris Buschelman           /* Third Column */
227224c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
227324c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
227424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
227524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
227624c233c2SKris Buschelman 
227724c233c2SKris Buschelman           /* Fourth Column */
227824c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
227924c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
228024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
228124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
228224c233c2SKris Buschelman         SSE_INLINE_END_2
228324c233c2SKris Buschelman         v  += 16;
228424c233c2SKris Buschelman       }
228524c233c2SKris Buschelman       v    = aa + ai16;
228624c233c2SKris Buschelman       ai16 = 16*diag[--i];
228724c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
228824c233c2SKris Buschelman       /*
228924c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
229024c233c2SKris Buschelman          which was inverted as part of the factorization
229124c233c2SKris Buschelman       */
229224c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
229324c233c2SKris Buschelman         /* First Column */
229424c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
229524c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
229624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
229724c233c2SKris Buschelman 
229824c233c2SKris Buschelman         /* Second Column */
229924c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
230024c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
230124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
230224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
230324c233c2SKris Buschelman 
230424c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
230524c233c2SKris Buschelman 
230624c233c2SKris Buschelman         /* Third Column */
230724c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
230824c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
230924c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
231024c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
231124c233c2SKris Buschelman 
231224c233c2SKris Buschelman         /* Fourth Column */
231324c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
231424c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
231524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
231624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
231724c233c2SKris Buschelman 
231824c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
231924c233c2SKris Buschelman       SSE_INLINE_END_3
232024c233c2SKris Buschelman 
232124c233c2SKris Buschelman       /* Promote solution from float to double */
232224c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
232324c233c2SKris Buschelman 
232424c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
232524c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
232624c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
232724c233c2SKris Buschelman       idc  = 4*(*c--);
232824c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
232924c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
233024c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
233124c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
233224c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
233324c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
233424c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
233524c233c2SKris Buschelman       SSE_INLINE_END_2
233624c233c2SKris Buschelman       v    = aa + ai16 + 16;
233724c233c2SKris Buschelman       idt -= 4;
233824c233c2SKris Buschelman     }
233924c233c2SKris Buschelman 
234024c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
234124c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23421ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
23431ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2344dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
234524c233c2SKris Buschelman   SSE_SCOPE_END;
234624c233c2SKris Buschelman   PetscFunctionReturn(0);
234724c233c2SKris Buschelman }
234824c233c2SKris Buschelman 
234924c233c2SKris Buschelman #endif
23500ef38995SBarry Smith 
23510ef38995SBarry Smith 
23524e2b4712SSatish Balay /*
23534e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
23544e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
23554e2b4712SSatish Balay */
23564a2ae208SSatish Balay #undef __FUNCT__
23574a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2358dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
23594e2b4712SSatish Balay {
23604e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2361356650c2SBarry Smith   PetscInt          n=a->mbs;
2362356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2363dfbe8321SBarry Smith   PetscErrorCode    ierr;
2364356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2365d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2366d9fead3dSBarry Smith   PetscScalar       *x;
2367d9fead3dSBarry Smith   const PetscScalar *b;
23684e2b4712SSatish Balay 
23694e2b4712SSatish Balay   PetscFunctionBegin;
2370d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23724e2b4712SSatish Balay 
2373aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
23742853dc0eSBarry Smith   {
237587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
23762853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
23772853dc0eSBarry Smith   }
2378aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
23792853dc0eSBarry Smith   {
238087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
23812853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
23822853dc0eSBarry Smith   }
2383aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
23842853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2385e1293385SBarry Smith #else
238630d4dcafSBarry Smith   {
238787828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2388d9fead3dSBarry Smith     const MatScalar *v;
2389356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2390356650c2SBarry Smith     const PetscInt  *vi;
2391e1293385SBarry Smith 
23924e2b4712SSatish Balay   /* forward solve the lower triangular */
23934e2b4712SSatish Balay   idx    = 0;
2394e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
23954e2b4712SSatish Balay   for (i=1; i<n; i++) {
23964e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
23974e2b4712SSatish Balay     vi    =  aj      + ai[i];
23984e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2399e1293385SBarry Smith     idx   +=  4;
2400f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
24014e2b4712SSatish Balay     while (nz--) {
24024e2b4712SSatish Balay       jdx   = 4*(*vi++);
24034e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2404f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2405f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2406f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2407f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
24084e2b4712SSatish Balay       v    += 16;
24094e2b4712SSatish Balay     }
2410f1af5d2fSBarry Smith     x[idx]   = s1;
2411f1af5d2fSBarry Smith     x[1+idx] = s2;
2412f1af5d2fSBarry Smith     x[2+idx] = s3;
2413f1af5d2fSBarry Smith     x[3+idx] = s4;
24144e2b4712SSatish Balay   }
24154e2b4712SSatish Balay   /* backward solve the upper triangular */
24164e555682SBarry Smith   idt = 4*(n-1);
24174e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
24184e555682SBarry Smith     ai16 = 16*diag[i];
24194e555682SBarry Smith     v    = aa + ai16 + 16;
24204e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
24214e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2422f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2423f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
24244e2b4712SSatish Balay     while (nz--) {
24254e2b4712SSatish Balay       idx   = 4*(*vi++);
24264e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2427f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2428f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2429f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2430f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
24314e2b4712SSatish Balay       v    += 16;
24324e2b4712SSatish Balay     }
24334e555682SBarry Smith     v        = aa + ai16;
2434f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2435f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2436f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2437f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2438329f5518SBarry Smith     idt -= 4;
24394e2b4712SSatish Balay   }
244030d4dcafSBarry Smith   }
2441e1293385SBarry Smith #endif
24424e2b4712SSatish Balay 
2443d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2445dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
24464e2b4712SSatish Balay   PetscFunctionReturn(0);
24474e2b4712SSatish Balay }
24484e2b4712SSatish Balay 
2449f26ec98cSKris Buschelman #undef __FUNCT__
2450*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
2451*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2452*cee9d6f2SShri Abhyankar {
2453*cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2454*cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2455*cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
2456*cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
2457*cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
2458*cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
2459*cee9d6f2SShri Abhyankar     PetscScalar       *x;
2460*cee9d6f2SShri Abhyankar     const PetscScalar *b;
2461*cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
2462*cee9d6f2SShri Abhyankar 
2463*cee9d6f2SShri Abhyankar     PetscFunctionBegin;
2464*cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2465*cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2466*cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
2467*cee9d6f2SShri Abhyankar     idx    = 0;
2468*cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
2469*cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
2470*cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
2471*cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
2472*cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
2473*cee9d6f2SShri Abhyankar       idx   = bs*i;
2474*cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2475*cee9d6f2SShri Abhyankar        while (nz--) {
2476*cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
2477*cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
2478*cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2479*cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2480*cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2481*cee9d6f2SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2482*cee9d6f2SShri Abhyankar 
2483*cee9d6f2SShri Abhyankar           v   +=  bs2;
2484*cee9d6f2SShri Abhyankar         }
2485*cee9d6f2SShri Abhyankar 
2486*cee9d6f2SShri Abhyankar        x[idx]   = s1;
2487*cee9d6f2SShri Abhyankar        x[1+idx] = s2;
2488*cee9d6f2SShri Abhyankar        x[2+idx] = s3;
2489*cee9d6f2SShri Abhyankar        x[3+idx] = s4;
2490*cee9d6f2SShri Abhyankar     }
2491*cee9d6f2SShri Abhyankar 
2492*cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
2493*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
2494*cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
2495*cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
2496*cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
2497*cee9d6f2SShri Abhyankar      idt = bs*i;
2498*cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
2499*cee9d6f2SShri Abhyankar 
2500*cee9d6f2SShri Abhyankar     while (nz--) {
2501*cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
2502*cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
2503*cee9d6f2SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
2504*cee9d6f2SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
2505*cee9d6f2SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2506*cee9d6f2SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2507*cee9d6f2SShri Abhyankar 
2508*cee9d6f2SShri Abhyankar         v   +=  bs2;
2509*cee9d6f2SShri Abhyankar     }
2510*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
2511*cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
2512*cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
2513*cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2514*cee9d6f2SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2515*cee9d6f2SShri Abhyankar 
2516*cee9d6f2SShri Abhyankar   }
2517*cee9d6f2SShri Abhyankar 
2518*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2519*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2520*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2521*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
2522*cee9d6f2SShri Abhyankar }
2523*cee9d6f2SShri Abhyankar 
2524*cee9d6f2SShri Abhyankar 
2525*cee9d6f2SShri Abhyankar 
2526*cee9d6f2SShri Abhyankar #undef __FUNCT__
2527f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2528dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2529f26ec98cSKris Buschelman {
2530f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2531690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2532dfbe8321SBarry Smith   PetscErrorCode ierr;
2533690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2534f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2535f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2536f26ec98cSKris Buschelman 
2537f26ec98cSKris Buschelman   PetscFunctionBegin;
25381ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25391ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2540f26ec98cSKris Buschelman 
2541f26ec98cSKris Buschelman   {
2542f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2543f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2544690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2545f26ec98cSKris Buschelman 
2546f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2547f26ec98cSKris Buschelman     idx  = 0;
2548f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2549f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2550f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2551f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2552f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2553f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2554f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2555f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2556f26ec98cSKris Buschelman       idx   +=  4;
2557f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2558f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2559f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2560f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2561f26ec98cSKris Buschelman       while (nz--) {
2562f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2563f26ec98cSKris Buschelman         x1  = t[jdx];
2564f26ec98cSKris Buschelman         x2  = t[1+jdx];
2565f26ec98cSKris Buschelman         x3  = t[2+jdx];
2566f26ec98cSKris Buschelman         x4  = t[3+jdx];
2567f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2568f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2569f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2570f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2571f26ec98cSKris Buschelman         v    += 16;
2572f26ec98cSKris Buschelman       }
2573f26ec98cSKris Buschelman       t[idx]   = s1;
2574f26ec98cSKris Buschelman       t[1+idx] = s2;
2575f26ec98cSKris Buschelman       t[2+idx] = s3;
2576f26ec98cSKris Buschelman       t[3+idx] = s4;
2577f26ec98cSKris Buschelman     }
2578f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2579f26ec98cSKris Buschelman     idt = 4*(n-1);
2580f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2581f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2582f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2583f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2584f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2585f26ec98cSKris Buschelman       s1   = t[idt];
2586f26ec98cSKris Buschelman       s2   = t[1+idt];
2587f26ec98cSKris Buschelman       s3   = t[2+idt];
2588f26ec98cSKris Buschelman       s4   = t[3+idt];
2589f26ec98cSKris Buschelman       while (nz--) {
2590f26ec98cSKris Buschelman         idx = 4*(*vi++);
2591f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2592f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2593f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2594f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2595f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2596f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2597f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2598f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2599f26ec98cSKris Buschelman         v    += 16;
2600f26ec98cSKris Buschelman       }
2601f26ec98cSKris Buschelman       v        = aa + ai16;
2602f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2603f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2604f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2605f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2606f26ec98cSKris Buschelman       idt -= 4;
2607f26ec98cSKris Buschelman     }
2608f26ec98cSKris Buschelman   }
2609f26ec98cSKris Buschelman 
26101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2612dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2613f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2614f26ec98cSKris Buschelman }
2615f26ec98cSKris Buschelman 
26163660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
26173660e330SKris Buschelman 
26183660e330SKris Buschelman #include PETSC_HAVE_SSE
26193660e330SKris Buschelman #undef __FUNCT__
26207cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2621dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
26223660e330SKris Buschelman {
26233660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
26242aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2625dfbe8321SBarry Smith   PetscErrorCode ierr;
2626dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
26273660e330SKris Buschelman   MatScalar      *aa=a->a;
262887828ca2SBarry Smith   PetscScalar    *x,*b;
26293660e330SKris Buschelman 
26303660e330SKris Buschelman   PetscFunctionBegin;
26313660e330SKris Buschelman   SSE_SCOPE_BEGIN;
26323660e330SKris Buschelman   /*
26333660e330SKris Buschelman      Note: This code currently uses demotion of double
26343660e330SKris Buschelman      to float when performing the mixed-mode computation.
26353660e330SKris Buschelman      This may not be numerically reasonable for all applications.
26363660e330SKris Buschelman   */
26373660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
26383660e330SKris Buschelman 
26391ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26413660e330SKris Buschelman   {
2642eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2643eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
26442aa5897fSKris Buschelman     int            nz,i,idt,ai16;
26452aa5897fSKris Buschelman     unsigned int   jdx,idx;
26462aa5897fSKris Buschelman     unsigned short *vi;
2647eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
26483660e330SKris Buschelman 
2649eb05f457SKris Buschelman     /* First block is the identity. */
26503660e330SKris Buschelman     idx  = 0;
2651eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
26522aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
26533660e330SKris Buschelman 
26543660e330SKris Buschelman     for (i=1; i<n;) {
26553660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
26563660e330SKris Buschelman       vi   =  aj      + ai[i];
26573660e330SKris Buschelman       nz   =  diag[i] - ai[i];
26583660e330SKris Buschelman       idx +=  4;
26593660e330SKris Buschelman 
2660eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2661eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2662eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
26633660e330SKris Buschelman 
26643660e330SKris Buschelman       while (nz--) {
26653660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
26662aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
26673660e330SKris Buschelman 
26683660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2669eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
26703660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26713660e330SKris Buschelman 
26723660e330SKris Buschelman           /* First Column */
26733660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26743660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26753660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26763660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26773660e330SKris Buschelman 
26783660e330SKris Buschelman           /* Second Column */
26793660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26803660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26813660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26823660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26833660e330SKris Buschelman 
26843660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26853660e330SKris Buschelman 
26863660e330SKris Buschelman           /* Third Column */
26873660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26883660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26893660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26903660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26913660e330SKris Buschelman 
26923660e330SKris Buschelman           /* Fourth Column */
26933660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26943660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26953660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26963660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26973660e330SKris Buschelman         SSE_INLINE_END_2
26983660e330SKris Buschelman 
26993660e330SKris Buschelman         v  += 16;
27003660e330SKris Buschelman       }
27013660e330SKris Buschelman       v    =  aa + 16*ai[++i];
27023660e330SKris Buschelman       PREFETCH_NTA(v);
2703eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
27043660e330SKris Buschelman     }
2705eb05f457SKris Buschelman 
2706eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2707eb05f457SKris Buschelman 
27083660e330SKris Buschelman     idt  = 4*(n-1);
27093660e330SKris Buschelman     ai16 = 16*diag[n-1];
27103660e330SKris Buschelman     v    = aa + ai16 + 16;
27113660e330SKris Buschelman     for (i=n-1; i>=0;){
27123660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
27133660e330SKris Buschelman       vi = aj + diag[i] + 1;
27143660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
27153660e330SKris Buschelman 
2716eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
27173660e330SKris Buschelman 
27183660e330SKris Buschelman       while (nz--) {
27193660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
27202aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
27213660e330SKris Buschelman 
27223660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2723eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
27243660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
27253660e330SKris Buschelman 
27263660e330SKris Buschelman           /* First Column */
27273660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
27283660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
27293660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
27303660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
27313660e330SKris Buschelman 
27323660e330SKris Buschelman           /* Second Column */
27333660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
27343660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
27353660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
27363660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
27373660e330SKris Buschelman 
27383660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
27393660e330SKris Buschelman 
27403660e330SKris Buschelman           /* Third Column */
27413660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
27423660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
27433660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
27443660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
27453660e330SKris Buschelman 
27463660e330SKris Buschelman           /* Fourth Column */
27473660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
27483660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
27493660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
27503660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
27513660e330SKris Buschelman         SSE_INLINE_END_2
27523660e330SKris Buschelman         v  += 16;
27533660e330SKris Buschelman       }
27543660e330SKris Buschelman       v    = aa + ai16;
27553660e330SKris Buschelman       ai16 = 16*diag[--i];
27563660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
27573660e330SKris Buschelman       /*
27583660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
27593660e330SKris Buschelman          which was inverted as part of the factorization
27603660e330SKris Buschelman       */
2761eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
27623660e330SKris Buschelman         /* First Column */
27633660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
27643660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
27653660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
27663660e330SKris Buschelman 
27673660e330SKris Buschelman         /* Second Column */
27683660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
27693660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
27703660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
27713660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
27723660e330SKris Buschelman 
27733660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
27743660e330SKris Buschelman 
27753660e330SKris Buschelman         /* Third Column */
27763660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
27773660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
27783660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
27793660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
27803660e330SKris Buschelman 
27813660e330SKris Buschelman         /* Fourth Column */
27823660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
27833660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
27843660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
27853660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
27863660e330SKris Buschelman 
27873660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
27883660e330SKris Buschelman       SSE_INLINE_END_3
27893660e330SKris Buschelman 
27903660e330SKris Buschelman       v    = aa + ai16 + 16;
27913660e330SKris Buschelman       idt -= 4;
27923660e330SKris Buschelman     }
2793eb05f457SKris Buschelman 
2794eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2795eb05f457SKris Buschelman     idt = 4*(n-1);
2796eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2797eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2798eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2799eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2800eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2801eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2802eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2803eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2804eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
280554693613SKris Buschelman       idt -= 4;
28063660e330SKris Buschelman     }
2807eb05f457SKris Buschelman 
2808eb05f457SKris Buschelman   } /* End of artificial scope. */
28091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2811dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28123660e330SKris Buschelman   SSE_SCOPE_END;
28133660e330SKris Buschelman   PetscFunctionReturn(0);
28143660e330SKris Buschelman }
28153660e330SKris Buschelman 
28167cf1b8d3SKris Buschelman #undef __FUNCT__
28177cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2818dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
28197cf1b8d3SKris Buschelman {
28207cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
28217cf1b8d3SKris Buschelman   int            *aj=a->j;
2822dfbe8321SBarry Smith   PetscErrorCode ierr;
2823dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
28247cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
28257cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
28267cf1b8d3SKris Buschelman 
28277cf1b8d3SKris Buschelman   PetscFunctionBegin;
28287cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
28297cf1b8d3SKris Buschelman   /*
28307cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
28317cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
28327cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
28337cf1b8d3SKris Buschelman   */
28347cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
28357cf1b8d3SKris Buschelman 
28361ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
28371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28387cf1b8d3SKris Buschelman   {
28397cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
28407cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
28417cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
28427cf1b8d3SKris Buschelman     int       jdx,idx;
28437cf1b8d3SKris Buschelman     int       *vi;
28447cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
28457cf1b8d3SKris Buschelman 
28467cf1b8d3SKris Buschelman     /* First block is the identity. */
28477cf1b8d3SKris Buschelman     idx  = 0;
28487cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
28497cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
28507cf1b8d3SKris Buschelman 
28517cf1b8d3SKris Buschelman     for (i=1; i<n;) {
28527cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
28537cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
28547cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
28557cf1b8d3SKris Buschelman       idx +=  4;
28567cf1b8d3SKris Buschelman 
28577cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
28587cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
28597cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
28607cf1b8d3SKris Buschelman 
28617cf1b8d3SKris Buschelman       while (nz--) {
28627cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
28637cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
28647cf1b8d3SKris Buschelman /*          jdx = *vi++; */
28657cf1b8d3SKris Buschelman 
28667cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
28677cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
28687cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
28697cf1b8d3SKris Buschelman 
28707cf1b8d3SKris Buschelman           /* First Column */
28717cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
28727cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
28737cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
28747cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
28757cf1b8d3SKris Buschelman 
28767cf1b8d3SKris Buschelman           /* Second Column */
28777cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
28787cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
28797cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
28807cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
28817cf1b8d3SKris Buschelman 
28827cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
28837cf1b8d3SKris Buschelman 
28847cf1b8d3SKris Buschelman           /* Third Column */
28857cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
28867cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
28877cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
28887cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
28897cf1b8d3SKris Buschelman 
28907cf1b8d3SKris Buschelman           /* Fourth Column */
28917cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
28927cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
28937cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
28947cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
28957cf1b8d3SKris Buschelman         SSE_INLINE_END_2
28967cf1b8d3SKris Buschelman 
28977cf1b8d3SKris Buschelman         v  += 16;
28987cf1b8d3SKris Buschelman       }
28997cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
29007cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
29017cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
29027cf1b8d3SKris Buschelman     }
29037cf1b8d3SKris Buschelman 
29047cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
29057cf1b8d3SKris Buschelman 
29067cf1b8d3SKris Buschelman     idt  = 4*(n-1);
29077cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
29087cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
29097cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
29107cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
29117cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
29127cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
29137cf1b8d3SKris Buschelman 
29147cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
29157cf1b8d3SKris Buschelman 
29167cf1b8d3SKris Buschelman       while (nz--) {
29177cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
29187cf1b8d3SKris Buschelman         idx = 4*(*vi++);
29197cf1b8d3SKris Buschelman /*          idx = *vi++; */
29207cf1b8d3SKris Buschelman 
29217cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
29227cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
29237cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
29247cf1b8d3SKris Buschelman 
29257cf1b8d3SKris Buschelman           /* First Column */
29267cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
29277cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
29287cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
29297cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
29307cf1b8d3SKris Buschelman 
29317cf1b8d3SKris Buschelman           /* Second Column */
29327cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
29337cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
29347cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
29357cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
29367cf1b8d3SKris Buschelman 
29377cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
29387cf1b8d3SKris Buschelman 
29397cf1b8d3SKris Buschelman           /* Third Column */
29407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
29417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
29427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
29437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
29447cf1b8d3SKris Buschelman 
29457cf1b8d3SKris Buschelman           /* Fourth Column */
29467cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
29477cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
29487cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
29497cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
29507cf1b8d3SKris Buschelman         SSE_INLINE_END_2
29517cf1b8d3SKris Buschelman         v  += 16;
29527cf1b8d3SKris Buschelman       }
29537cf1b8d3SKris Buschelman       v    = aa + ai16;
29547cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
29557cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
29567cf1b8d3SKris Buschelman       /*
29577cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
29587cf1b8d3SKris Buschelman          which was inverted as part of the factorization
29597cf1b8d3SKris Buschelman       */
29607cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
29617cf1b8d3SKris Buschelman         /* First Column */
29627cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
29637cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
29647cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
29657cf1b8d3SKris Buschelman 
29667cf1b8d3SKris Buschelman         /* Second Column */
29677cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
29687cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
29697cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
29707cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
29717cf1b8d3SKris Buschelman 
29727cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
29737cf1b8d3SKris Buschelman 
29747cf1b8d3SKris Buschelman         /* Third Column */
29757cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
29767cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
29777cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
29787cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
29797cf1b8d3SKris Buschelman 
29807cf1b8d3SKris Buschelman         /* Fourth Column */
29817cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
29827cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
29837cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
29847cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
29857cf1b8d3SKris Buschelman 
29867cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
29877cf1b8d3SKris Buschelman       SSE_INLINE_END_3
29887cf1b8d3SKris Buschelman 
29897cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
29907cf1b8d3SKris Buschelman       idt -= 4;
29917cf1b8d3SKris Buschelman     }
29927cf1b8d3SKris Buschelman 
29937cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
29947cf1b8d3SKris Buschelman     idt = 4*(n-1);
29957cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
29967cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
29977cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
29987cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
29997cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
30007cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
30017cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
30027cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
30037cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
30047cf1b8d3SKris Buschelman       idt -= 4;
30057cf1b8d3SKris Buschelman     }
30067cf1b8d3SKris Buschelman 
30077cf1b8d3SKris Buschelman   } /* End of artificial scope. */
30081ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30091ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3010dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
30117cf1b8d3SKris Buschelman   SSE_SCOPE_END;
30127cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
30137cf1b8d3SKris Buschelman }
30147cf1b8d3SKris Buschelman 
30153660e330SKris Buschelman #endif
30164a2ae208SSatish Balay #undef __FUNCT__
30174a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3018dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
30194e2b4712SSatish Balay {
30204e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
30214e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
30226849ba73SBarry Smith   PetscErrorCode    ierr;
30235d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
30245d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3025d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3026d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3027d9fead3dSBarry Smith   const PetscScalar *b;
30284e2b4712SSatish Balay 
30294e2b4712SSatish Balay   PetscFunctionBegin;
3030d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3032f1af5d2fSBarry Smith   t  = a->solve_work;
30334e2b4712SSatish Balay 
30344e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30354e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30364e2b4712SSatish Balay 
30374e2b4712SSatish Balay   /* forward solve the lower triangular */
30384e2b4712SSatish Balay   idx    = 3*(*r++);
3039f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
30404e2b4712SSatish Balay   for (i=1; i<n; i++) {
30414e2b4712SSatish Balay     v     = aa + 9*ai[i];
30424e2b4712SSatish Balay     vi    = aj + ai[i];
30434e2b4712SSatish Balay     nz    = diag[i] - ai[i];
30444e2b4712SSatish Balay     idx   = 3*(*r++);
3045f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
30464e2b4712SSatish Balay     while (nz--) {
30474e2b4712SSatish Balay       idx   = 3*(*vi++);
3048f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3049f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3050f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3051f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
30524e2b4712SSatish Balay       v += 9;
30534e2b4712SSatish Balay     }
30544e2b4712SSatish Balay     idx = 3*i;
3055f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
30564e2b4712SSatish Balay   }
30574e2b4712SSatish Balay   /* backward solve the upper triangular */
30584e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30594e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
30604e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30614e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
30624e2b4712SSatish Balay     idt  = 3*i;
3063f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
30644e2b4712SSatish Balay     while (nz--) {
30654e2b4712SSatish Balay       idx   = 3*(*vi++);
3066f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3067f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3068f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3069f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
30704e2b4712SSatish Balay       v += 9;
30714e2b4712SSatish Balay     }
30724e2b4712SSatish Balay     idc = 3*(*c--);
30734e2b4712SSatish Balay     v   = aa + 9*diag[i];
3074f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3075f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3076f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
30774e2b4712SSatish Balay   }
30784e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30794e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3080d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3082dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
30834e2b4712SSatish Balay   PetscFunctionReturn(0);
30844e2b4712SSatish Balay }
30854e2b4712SSatish Balay 
308615091d37SBarry Smith /*
308715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
308815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
308915091d37SBarry Smith */
30904a2ae208SSatish Balay #undef __FUNCT__
30914a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
3092dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
309315091d37SBarry Smith {
309415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3095690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3096dfbe8321SBarry Smith   PetscErrorCode    ierr;
3097690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3098d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3099d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
3100d9fead3dSBarry Smith   const PetscScalar *b;
3101690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
310215091d37SBarry Smith 
310315091d37SBarry Smith   PetscFunctionBegin;
3104d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31051ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
310615091d37SBarry Smith 
310715091d37SBarry Smith   /* forward solve the lower triangular */
310815091d37SBarry Smith   idx    = 0;
310915091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
311015091d37SBarry Smith   for (i=1; i<n; i++) {
311115091d37SBarry Smith     v     =  aa      + 9*ai[i];
311215091d37SBarry Smith     vi    =  aj      + ai[i];
311315091d37SBarry Smith     nz    =  diag[i] - ai[i];
311415091d37SBarry Smith     idx   +=  3;
3115f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
311615091d37SBarry Smith     while (nz--) {
311715091d37SBarry Smith       jdx   = 3*(*vi++);
311815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
3119f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3120f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3121f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
312215091d37SBarry Smith       v    += 9;
312315091d37SBarry Smith     }
3124f1af5d2fSBarry Smith     x[idx]   = s1;
3125f1af5d2fSBarry Smith     x[1+idx] = s2;
3126f1af5d2fSBarry Smith     x[2+idx] = s3;
312715091d37SBarry Smith   }
312815091d37SBarry Smith   /* backward solve the upper triangular */
312915091d37SBarry Smith   for (i=n-1; i>=0; i--){
313015091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
313115091d37SBarry Smith     vi   = aj + diag[i] + 1;
313215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
313315091d37SBarry Smith     idt  = 3*i;
3134f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3135f1af5d2fSBarry Smith     s3 = x[2+idt];
313615091d37SBarry Smith     while (nz--) {
313715091d37SBarry Smith       idx   = 3*(*vi++);
313815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
3139f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3140f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3141f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
314215091d37SBarry Smith       v    += 9;
314315091d37SBarry Smith     }
314415091d37SBarry Smith     v        = aa +  9*diag[i];
3145f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3146f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3147f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
314815091d37SBarry Smith   }
314915091d37SBarry Smith 
3150d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3152dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
315315091d37SBarry Smith   PetscFunctionReturn(0);
315415091d37SBarry Smith }
315515091d37SBarry Smith 
31564a2ae208SSatish Balay #undef __FUNCT__
3157*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
3158*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3159*cee9d6f2SShri Abhyankar {
3160*cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3161*cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
3162*cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3163*cee9d6f2SShri Abhyankar     PetscInt          idx,jdx,idt;
3164*cee9d6f2SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3165*cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3166*cee9d6f2SShri Abhyankar     PetscScalar       *x;
3167*cee9d6f2SShri Abhyankar     const PetscScalar *b;
3168*cee9d6f2SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
3169*cee9d6f2SShri Abhyankar 
3170*cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3171*cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3172*cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3173*cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3174*cee9d6f2SShri Abhyankar     idx    = 0;
3175*cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
3176*cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3177*cee9d6f2SShri Abhyankar        v    = aa + bs2*ai[i];
3178*cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3179*cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3180*cee9d6f2SShri Abhyankar       idx   = bs*i;
3181*cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
3182*cee9d6f2SShri Abhyankar        while (nz--) {
3183*cee9d6f2SShri Abhyankar           jdx   = bs*(*vi++);
3184*cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
3185*cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3186*cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3187*cee9d6f2SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3188*cee9d6f2SShri Abhyankar 
3189*cee9d6f2SShri Abhyankar           v   +=  bs2;
3190*cee9d6f2SShri Abhyankar         }
3191*cee9d6f2SShri Abhyankar 
3192*cee9d6f2SShri Abhyankar        x[idx]   = s1;
3193*cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3194*cee9d6f2SShri Abhyankar        x[2+idx] = s3;
3195*cee9d6f2SShri Abhyankar     }
3196*cee9d6f2SShri Abhyankar 
3197*cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3198*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3199*cee9d6f2SShri Abhyankar      v   = aa + bs2*ai[2*n-i];
3200*cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3201*cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3202*cee9d6f2SShri Abhyankar      idt = bs*i;
3203*cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
3204*cee9d6f2SShri Abhyankar 
3205*cee9d6f2SShri Abhyankar     while (nz--) {
3206*cee9d6f2SShri Abhyankar       idx   = bs*(*vi++);
3207*cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
3208*cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3209*cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3210*cee9d6f2SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
3211*cee9d6f2SShri Abhyankar 
3212*cee9d6f2SShri Abhyankar         v   +=  bs2;
3213*cee9d6f2SShri Abhyankar     }
3214*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3215*cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3216*cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3217*cee9d6f2SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
3218*cee9d6f2SShri Abhyankar 
3219*cee9d6f2SShri Abhyankar   }
3220*cee9d6f2SShri Abhyankar 
3221*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3222*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3223*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3224*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3225*cee9d6f2SShri Abhyankar }
3226*cee9d6f2SShri Abhyankar 
3227*cee9d6f2SShri Abhyankar #undef __FUNCT__
32284a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
3229dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
32304e2b4712SSatish Balay {
32314e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
32324e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
32336849ba73SBarry Smith   PetscErrorCode    ierr;
32345d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
32355d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3236d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3237d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
3238d9fead3dSBarry Smith   const PetscScalar *b;
32394e2b4712SSatish Balay 
32404e2b4712SSatish Balay   PetscFunctionBegin;
3241d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3243f1af5d2fSBarry Smith   t  = a->solve_work;
32444e2b4712SSatish Balay 
32454e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
32464e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
32474e2b4712SSatish Balay 
32484e2b4712SSatish Balay   /* forward solve the lower triangular */
32494e2b4712SSatish Balay   idx    = 2*(*r++);
3250f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
32514e2b4712SSatish Balay   for (i=1; i<n; i++) {
32524e2b4712SSatish Balay     v     = aa + 4*ai[i];
32534e2b4712SSatish Balay     vi    = aj + ai[i];
32544e2b4712SSatish Balay     nz    = diag[i] - ai[i];
32554e2b4712SSatish Balay     idx   = 2*(*r++);
3256f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
32574e2b4712SSatish Balay     while (nz--) {
32584e2b4712SSatish Balay       idx   = 2*(*vi++);
3259f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3260f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3261f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
32624e2b4712SSatish Balay       v += 4;
32634e2b4712SSatish Balay     }
32644e2b4712SSatish Balay     idx = 2*i;
3265f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
32664e2b4712SSatish Balay   }
32674e2b4712SSatish Balay   /* backward solve the upper triangular */
32684e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
32694e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
32704e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
32714e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
32724e2b4712SSatish Balay     idt  = 2*i;
3273f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
32744e2b4712SSatish Balay     while (nz--) {
32754e2b4712SSatish Balay       idx   = 2*(*vi++);
3276f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3277f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3278f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
32794e2b4712SSatish Balay       v += 4;
32804e2b4712SSatish Balay     }
32814e2b4712SSatish Balay     idc = 2*(*c--);
32824e2b4712SSatish Balay     v   = aa + 4*diag[i];
3283f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3284f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
32854e2b4712SSatish Balay   }
32864e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
32874e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3288d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3290dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
32914e2b4712SSatish Balay   PetscFunctionReturn(0);
32924e2b4712SSatish Balay }
32934e2b4712SSatish Balay 
329415091d37SBarry Smith /*
329515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
329615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
329715091d37SBarry Smith */
32984a2ae208SSatish Balay #undef __FUNCT__
32994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3300dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
330115091d37SBarry Smith {
330215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3303690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3304dfbe8321SBarry Smith   PetscErrorCode    ierr;
3305690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3306d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3307d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
3308d9fead3dSBarry Smith   const PetscScalar *b;
3309690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
331015091d37SBarry Smith 
331115091d37SBarry Smith   PetscFunctionBegin;
3312d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
331415091d37SBarry Smith 
331515091d37SBarry Smith   /* forward solve the lower triangular */
331615091d37SBarry Smith   idx    = 0;
331715091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
331815091d37SBarry Smith   for (i=1; i<n; i++) {
331915091d37SBarry Smith     v     =  aa      + 4*ai[i];
332015091d37SBarry Smith     vi    =  aj      + ai[i];
332115091d37SBarry Smith     nz    =  diag[i] - ai[i];
332215091d37SBarry Smith     idx   +=  2;
3323f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
332415091d37SBarry Smith     while (nz--) {
332515091d37SBarry Smith       jdx   = 2*(*vi++);
332615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3327f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3328f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
332915091d37SBarry Smith       v    += 4;
333015091d37SBarry Smith     }
3331f1af5d2fSBarry Smith     x[idx]   = s1;
3332f1af5d2fSBarry Smith     x[1+idx] = s2;
333315091d37SBarry Smith   }
333415091d37SBarry Smith   /* backward solve the upper triangular */
333515091d37SBarry Smith   for (i=n-1; i>=0; i--){
333615091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
333715091d37SBarry Smith     vi   = aj + diag[i] + 1;
333815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
333915091d37SBarry Smith     idt  = 2*i;
3340f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
334115091d37SBarry Smith     while (nz--) {
334215091d37SBarry Smith       idx   = 2*(*vi++);
334315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3344f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3345f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
334615091d37SBarry Smith       v    += 4;
334715091d37SBarry Smith     }
334815091d37SBarry Smith     v        = aa +  4*diag[i];
3349f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3350f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
335115091d37SBarry Smith   }
335215091d37SBarry Smith 
3353d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3355dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
335615091d37SBarry Smith   PetscFunctionReturn(0);
335715091d37SBarry Smith }
335815091d37SBarry Smith 
33594a2ae208SSatish Balay #undef __FUNCT__
3360*cee9d6f2SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
3361*cee9d6f2SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3362*cee9d6f2SShri Abhyankar {
3363*cee9d6f2SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3364*cee9d6f2SShri Abhyankar     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3365*cee9d6f2SShri Abhyankar     PetscErrorCode    ierr;
3366*cee9d6f2SShri Abhyankar     PetscInt          jdx;
3367*cee9d6f2SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3368*cee9d6f2SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
3369*cee9d6f2SShri Abhyankar     const PetscScalar *b;
3370*cee9d6f2SShri Abhyankar 
3371*cee9d6f2SShri Abhyankar     PetscFunctionBegin;
3372*cee9d6f2SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3373*cee9d6f2SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3374*cee9d6f2SShri Abhyankar     /* forward solve the lower triangular */
3375*cee9d6f2SShri Abhyankar     idx    = 0;
3376*cee9d6f2SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
3377*cee9d6f2SShri Abhyankar     for (i=1; i<n; i++) {
3378*cee9d6f2SShri Abhyankar         v   = aa + 4*ai[i];
3379*cee9d6f2SShri Abhyankar        vi   = aj + ai[i];
3380*cee9d6f2SShri Abhyankar        nz   = ai[i+1] - ai[i];
3381*cee9d6f2SShri Abhyankar        idx  = 2*i;
3382*cee9d6f2SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
3383*cee9d6f2SShri Abhyankar        while (nz--) {
3384*cee9d6f2SShri Abhyankar           jdx   = 2*(*vi++);
3385*cee9d6f2SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
3386*cee9d6f2SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
3387*cee9d6f2SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
3388*cee9d6f2SShri Abhyankar            v   +=  4;
3389*cee9d6f2SShri Abhyankar         }
3390*cee9d6f2SShri Abhyankar        x[idx]   = s1;
3391*cee9d6f2SShri Abhyankar        x[1+idx] = s2;
3392*cee9d6f2SShri Abhyankar     }
3393*cee9d6f2SShri Abhyankar 
3394*cee9d6f2SShri Abhyankar    /* backward solve the upper triangular */
3395*cee9d6f2SShri Abhyankar   for (i=n-1; i>=0; i--){
3396*cee9d6f2SShri Abhyankar      v   = aa + 4*ai[2*n-i];
3397*cee9d6f2SShri Abhyankar      vi  = aj + ai[2*n-i];
3398*cee9d6f2SShri Abhyankar      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
3399*cee9d6f2SShri Abhyankar      idt = 2*i;
3400*cee9d6f2SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
3401*cee9d6f2SShri Abhyankar      while (nz--) {
3402*cee9d6f2SShri Abhyankar       idx   = 2*(*vi++);
3403*cee9d6f2SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
3404*cee9d6f2SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
3405*cee9d6f2SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
3406*cee9d6f2SShri Abhyankar          v    += 4;
3407*cee9d6f2SShri Abhyankar     }
3408*cee9d6f2SShri Abhyankar     /* x = inv_diagonal*x */
3409*cee9d6f2SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
3410*cee9d6f2SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
3411*cee9d6f2SShri Abhyankar   }
3412*cee9d6f2SShri Abhyankar 
3413*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3414*cee9d6f2SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3415*cee9d6f2SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
3416*cee9d6f2SShri Abhyankar   PetscFunctionReturn(0);
3417*cee9d6f2SShri Abhyankar }
3418*cee9d6f2SShri Abhyankar 
3419*cee9d6f2SShri Abhyankar #undef __FUNCT__
34204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3421dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
34224e2b4712SSatish Balay {
34234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
34244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
34256849ba73SBarry Smith   PetscErrorCode ierr;
34265d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
34275d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
34283f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
342987828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
34304e2b4712SSatish Balay 
34314e2b4712SSatish Balay   PetscFunctionBegin;
34324e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
34334e2b4712SSatish Balay 
34341ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3436f1af5d2fSBarry Smith   t  = a->solve_work;
34374e2b4712SSatish Balay 
34384e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
34394e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
34404e2b4712SSatish Balay 
34414e2b4712SSatish Balay   /* forward solve the lower triangular */
3442f1af5d2fSBarry Smith   t[0] = b[*r++];
34434e2b4712SSatish Balay   for (i=1; i<n; i++) {
34444e2b4712SSatish Balay     v     = aa + ai[i];
34454e2b4712SSatish Balay     vi    = aj + ai[i];
34464e2b4712SSatish Balay     nz    = diag[i] - ai[i];
3447f1af5d2fSBarry Smith     s1  = b[*r++];
34484e2b4712SSatish Balay     while (nz--) {
3449f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
34504e2b4712SSatish Balay     }
3451f1af5d2fSBarry Smith     t[i] = s1;
34524e2b4712SSatish Balay   }
34534e2b4712SSatish Balay   /* backward solve the upper triangular */
34544e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
34554e2b4712SSatish Balay     v    = aa + diag[i] + 1;
34564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
34574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3458f1af5d2fSBarry Smith     s1 = t[i];
34594e2b4712SSatish Balay     while (nz--) {
3460f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
34614e2b4712SSatish Balay     }
3462f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
34634e2b4712SSatish Balay   }
34644e2b4712SSatish Balay 
34654e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
34664e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
34671ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
34681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3469dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
34704e2b4712SSatish Balay   PetscFunctionReturn(0);
34714e2b4712SSatish Balay }
347215091d37SBarry Smith /*
347315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
347415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
347515091d37SBarry Smith */
34764a2ae208SSatish Balay #undef __FUNCT__
34774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3478dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
347915091d37SBarry Smith {
348015091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3481690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3482dfbe8321SBarry Smith   PetscErrorCode ierr;
3483690b6cddSBarry Smith   PetscInt       *diag = a->diag;
348415091d37SBarry Smith   MatScalar      *aa=a->a;
348587828ca2SBarry Smith   PetscScalar    *x,*b;
348687828ca2SBarry Smith   PetscScalar    s1,x1;
348715091d37SBarry Smith   MatScalar      *v;
3488690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
348915091d37SBarry Smith 
349015091d37SBarry Smith   PetscFunctionBegin;
34911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
349315091d37SBarry Smith 
349415091d37SBarry Smith   /* forward solve the lower triangular */
349515091d37SBarry Smith   idx    = 0;
349615091d37SBarry Smith   x[0]   = b[0];
349715091d37SBarry Smith   for (i=1; i<n; i++) {
349815091d37SBarry Smith     v     =  aa      + ai[i];
349915091d37SBarry Smith     vi    =  aj      + ai[i];
350015091d37SBarry Smith     nz    =  diag[i] - ai[i];
350115091d37SBarry Smith     idx   +=  1;
3502f1af5d2fSBarry Smith     s1  =  b[idx];
350315091d37SBarry Smith     while (nz--) {
350415091d37SBarry Smith       jdx   = *vi++;
350515091d37SBarry Smith       x1    = x[jdx];
3506f1af5d2fSBarry Smith       s1 -= v[0]*x1;
350715091d37SBarry Smith       v    += 1;
350815091d37SBarry Smith     }
3509f1af5d2fSBarry Smith     x[idx]   = s1;
351015091d37SBarry Smith   }
351115091d37SBarry Smith   /* backward solve the upper triangular */
351215091d37SBarry Smith   for (i=n-1; i>=0; i--){
351315091d37SBarry Smith     v    = aa + diag[i] + 1;
351415091d37SBarry Smith     vi   = aj + diag[i] + 1;
351515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
351615091d37SBarry Smith     idt  = i;
3517f1af5d2fSBarry Smith     s1 = x[idt];
351815091d37SBarry Smith     while (nz--) {
351915091d37SBarry Smith       idx   = *vi++;
352015091d37SBarry Smith       x1    = x[idx];
3521f1af5d2fSBarry Smith       s1 -= v[0]*x1;
352215091d37SBarry Smith       v    += 1;
352315091d37SBarry Smith     }
352415091d37SBarry Smith     v        = aa +  diag[i];
3525f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
352615091d37SBarry Smith   }
35271ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
35281ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3529dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
353015091d37SBarry Smith   PetscFunctionReturn(0);
353115091d37SBarry Smith }
35324e2b4712SSatish Balay 
35334e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/*
   Prototypes of SeqBAIJ routines that are not defined in this part of the file.
   MatDuplicateNoCreate_SeqBAIJ is called by
   MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct further down; no call to
   MatSeqBAIJSetNumericFactorization is visible in this part of the file.
*/
35346bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
35356bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
35366bce7ff8SHong Zhang 
353784a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
35386bce7ff8SHong Zhang #undef __FUNCT__
35396bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
35406bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
35416bce7ff8SHong Zhang {
35426bce7ff8SHong Zhang   Mat            C=B;
35436bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
35446bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
35456bce7ff8SHong Zhang   PetscErrorCode ierr;
35466bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
35476bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
35486bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3549914a18a2SHong Zhang   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3550914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3551914a18a2SHong Zhang   MatScalar      *v_work;
35526bce7ff8SHong Zhang 
35536bce7ff8SHong Zhang   PetscFunctionBegin;
35546bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
35556bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3556914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3557914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
35586bce7ff8SHong Zhang   ics  = ic;
35596bce7ff8SHong Zhang 
3560914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
3561914a18a2SHong Zhang   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3562914a18a2SHong Zhang   multiplier = v_work + bs;
3563914a18a2SHong Zhang   v_pivots   = (PetscInt*)(multiplier + bs2);
3564914a18a2SHong Zhang 
35656bce7ff8SHong Zhang   for (i=0; i<n; i++){
35666bce7ff8SHong Zhang     /* zero rtmp */
35676bce7ff8SHong Zhang     /* L part */
35686bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
35696bce7ff8SHong Zhang     bjtmp = bj + bi[i];
3570914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3571914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3572914a18a2SHong Zhang     }
35736bce7ff8SHong Zhang 
35746bce7ff8SHong Zhang     /* U part */
35756bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
35766bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
3577914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3578914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3579914a18a2SHong Zhang     }
35806bce7ff8SHong Zhang 
35816bce7ff8SHong Zhang     /* load in initial (unfactored row) */
35826bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
35836bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
3584914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
35856bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3586914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
35876bce7ff8SHong Zhang     }
35886bce7ff8SHong Zhang 
35896bce7ff8SHong Zhang     /* elimination */
35906bce7ff8SHong Zhang     bjtmp = bj + bi[i];
35916bce7ff8SHong Zhang     row   = *bjtmp++;
35926bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
35936bce7ff8SHong Zhang     k   = 0;
35946bce7ff8SHong Zhang     while  (k < nzL) {
3595914a18a2SHong Zhang       pc = rtmp + bs2*row;
3596914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3597914a18a2SHong Zhang       if (flg) {
3598914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
3599914a18a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
36006bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3601914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
36026bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3603914a18a2SHong Zhang         for (j=0; j<nz; j++) {
3604914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3605914a18a2SHong Zhang         }
36066bce7ff8SHong Zhang         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
36076bce7ff8SHong Zhang       }
36086bce7ff8SHong Zhang       row = *bjtmp++; k++;
36096bce7ff8SHong Zhang     }
36106bce7ff8SHong Zhang 
36116bce7ff8SHong Zhang     /* finished row so stick it into b->a */
36126bce7ff8SHong Zhang     /* L part */
3613914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
36146bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
36156bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
36166bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3617914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
36186bce7ff8SHong Zhang     }
36196bce7ff8SHong Zhang 
36206bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
3621914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
36226bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
3623914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3624914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3625914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
36266bce7ff8SHong Zhang 
36276bce7ff8SHong Zhang     /* U part */
3628914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
36296bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
36306bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3631914a18a2SHong Zhang     for (j=0; j<nz; j++){
3632914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3633914a18a2SHong Zhang     }
36346bce7ff8SHong Zhang   }
36356bce7ff8SHong Zhang 
36366bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
36376bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
36386bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
36396bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
364027019359SHong Zhang 
364127019359SHong Zhang   switch (A->rmap->bs){
364227019359SHong Zhang   case 2:
364327019359SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
364427019359SHong Zhang     break;
3645*cee9d6f2SShri Abhyankar   case 3:
3646*cee9d6f2SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct;
3647*cee9d6f2SShri Abhyankar     break;
3648*cee9d6f2SShri Abhyankar   case 4:
3649*cee9d6f2SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct;
3650*cee9d6f2SShri Abhyankar     break;
365127019359SHong Zhang   case 5:
365284a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
365327019359SHong Zhang     break;
3654*cee9d6f2SShri Abhyankar   case 6:
3655*cee9d6f2SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct;
3656*cee9d6f2SShri Abhyankar     break;
3657*cee9d6f2SShri Abhyankar   case 7:
3658*cee9d6f2SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct;
3659*cee9d6f2SShri Abhyankar     break;
366027019359SHong Zhang   default:
366184a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
366227019359SHong Zhang     break;
366384a281e5SHong Zhang   }
36646bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
3665914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
36666bce7ff8SHong Zhang   PetscFunctionReturn(0);
36676bce7ff8SHong Zhang }
36686bce7ff8SHong Zhang 
36696bce7ff8SHong Zhang /*
36706bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
36716bce7ff8SHong Zhang    Factored arrays bj and ba are stored as
36726bce7ff8SHong Zhang      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
36736bce7ff8SHong Zhang 
36746bce7ff8SHong Zhang    bi=fact->i is an array of size 2n+2, in which
36756bce7ff8SHong Zhang    bi+
36766bce7ff8SHong Zhang      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
36776bce7ff8SHong Zhang      bi[n]      ->  end of L(n-1,:)+1
36786bce7ff8SHong Zhang      bi[n+1]    ->  1st entry of U(n-1,:)
36796bce7ff8SHong Zhang      bi[2n-i]   ->  1st entry of U(i,:)
36806bce7ff8SHong Zhang      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
36816bce7ff8SHong Zhang      bi[2n]     ->  end of U(0,:)+1
36826bce7ff8SHong Zhang 
36836bce7ff8SHong Zhang    U(i,:) contains diag[i] as its last entry, i.e.,
36846bce7ff8SHong Zhang     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
36856bce7ff8SHong Zhang */
36866bce7ff8SHong Zhang #undef __FUNCT__
36876bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
36886bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
36896bce7ff8SHong Zhang {
36906bce7ff8SHong Zhang 
36916bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
36926bce7ff8SHong Zhang   PetscErrorCode     ierr;
3693914a18a2SHong Zhang   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
36946bce7ff8SHong Zhang   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
36956bce7ff8SHong Zhang 
36966bce7ff8SHong Zhang   PetscFunctionBegin;
36976bce7ff8SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
36986bce7ff8SHong Zhang   b     = (Mat_SeqBAIJ*)(fact)->data;
3699914a18a2SHong Zhang   bdiag = b->diag;
37006bce7ff8SHong Zhang 
37016bce7ff8SHong Zhang   /* replace matrix arrays with single allocations, then reset values */
37026bce7ff8SHong Zhang   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
37036bce7ff8SHong Zhang 
37046bce7ff8SHong Zhang   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
37056bce7ff8SHong Zhang   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
37066bce7ff8SHong Zhang   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
37076bce7ff8SHong Zhang   b->singlemalloc = PETSC_FALSE;
37086bce7ff8SHong Zhang   if (mbs > 0) {
37096bce7ff8SHong Zhang     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
37106bce7ff8SHong Zhang   }
37116bce7ff8SHong Zhang 
37126bce7ff8SHong Zhang   /* set bi and bj with new data structure */
37136bce7ff8SHong Zhang   bi = b->i;
37146bce7ff8SHong Zhang   bj = b->j;
37156bce7ff8SHong Zhang 
37166bce7ff8SHong Zhang   /* L part */
37176bce7ff8SHong Zhang   bi[0] = 0;
37186bce7ff8SHong Zhang   for (i=0; i<mbs; i++){
37196bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
3720914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
37216bce7ff8SHong Zhang     aj = a->j + ai[i];
37226bce7ff8SHong Zhang     for (j=0; j<nz; j++){
37236bce7ff8SHong Zhang       *bj = aj[j]; bj++;
37246bce7ff8SHong Zhang     }
37256bce7ff8SHong Zhang   }
37266bce7ff8SHong Zhang 
37276bce7ff8SHong Zhang   /* U part */
37286bce7ff8SHong Zhang   bi[mbs+1] = bi[mbs];
37296bce7ff8SHong Zhang   for (i=mbs-1; i>=0; i--){
37306bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
37316bce7ff8SHong Zhang     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3732914a18a2SHong Zhang     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
37336bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
37346bce7ff8SHong Zhang     for (j=0; j<nz; j++){
37356bce7ff8SHong Zhang       *bj = aj[j]; bj++;
37366bce7ff8SHong Zhang     }
37376bce7ff8SHong Zhang     /* diag[i] */
37386bce7ff8SHong Zhang     *bj = i; bj++;
37396bce7ff8SHong Zhang     bdiag[i] = bi[2*mbs-i+1]-1;
37406bce7ff8SHong Zhang   }
37416bce7ff8SHong Zhang   PetscFunctionReturn(0);
37426bce7ff8SHong Zhang }
37436bce7ff8SHong Zhang 
37444e2b4712SSatish Balay /*
37454e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
37464e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
37474e2b4712SSatish Balay    Not a good example of code reuse.
37484e2b4712SSatish Balay */
3749435faa5fSBarry Smith 
37504a2ae208SSatish Balay #undef __FUNCT__
37514a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
37520481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
37534e2b4712SSatish Balay {
37544e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
37554e2b4712SSatish Balay   IS             isicol;
37566849ba73SBarry Smith   PetscErrorCode ierr;
37575d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
37585d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3759a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3760d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
376141df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
3762329f5518SBarry Smith   PetscReal      f;
37634e2b4712SSatish Balay 
37644e2b4712SSatish Balay   PetscFunctionBegin;
37656bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
37666bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
37676bce7ff8SHong Zhang 
3768435faa5fSBarry Smith   f             = info->fill;
3769690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3770690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
37714c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3772667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3773667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
37747d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
3775309c388cSBarry Smith 
377641df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
37776bce7ff8SHong Zhang 
37786bce7ff8SHong Zhang     PetscTruth newdatastruct=PETSC_FALSE;
37796bce7ff8SHong Zhang     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
37806bce7ff8SHong Zhang     if (newdatastruct){
37816bce7ff8SHong Zhang       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
37826bce7ff8SHong Zhang       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
37836bce7ff8SHong Zhang     } else {
3784719d5645SBarry Smith       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
37856bce7ff8SHong Zhang       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
37866bce7ff8SHong Zhang     }
37876bce7ff8SHong Zhang 
3788719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3789719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
3790bb3d539aSBarry Smith     b->row       = isrow;
3791bb3d539aSBarry Smith     b->col       = iscol;
3792bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3793bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3794bb3d539aSBarry Smith     b->icol      = isicol;
3795bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3796719d5645SBarry Smith     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
37976bce7ff8SHong Zhang     PetscFunctionReturn(0);
37986bce7ff8SHong Zhang   }
37996bce7ff8SHong Zhang 
38006bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
38014e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
38024e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
38034e2b4712SSatish Balay 
38044e2b4712SSatish Balay     /* get new row pointers */
3805690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
38064e2b4712SSatish Balay     ainew[0] = 0;
38074e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3808690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3809690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
38104e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3811690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
38124e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3813690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
38144e2b4712SSatish Balay     /* im is level for each filled value */
3815690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
38164e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3817690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
38184e2b4712SSatish Balay     dloc[0]  = 0;
38194e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3820435faa5fSBarry Smith 
3821435faa5fSBarry Smith       /* copy prow into linked list */
38224e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
38233b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
38244e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
38254e2b4712SSatish Balay       fill[n]    = n;
3826435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
38274e2b4712SSatish Balay       while (nz--) {
38284e2b4712SSatish Balay 	fm  = n;
38294e2b4712SSatish Balay 	idx = ic[*xi++];
38304e2b4712SSatish Balay 	do {
38314e2b4712SSatish Balay 	  m  = fm;
38324e2b4712SSatish Balay 	  fm = fill[m];
38334e2b4712SSatish Balay 	} while (fm < idx);
38344e2b4712SSatish Balay 	fill[m]   = idx;
38354e2b4712SSatish Balay 	fill[idx] = fm;
38364e2b4712SSatish Balay 	im[idx]   = 0;
38374e2b4712SSatish Balay       }
3838435faa5fSBarry Smith 
3839435faa5fSBarry Smith       /* make sure diagonal entry is included */
3840435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3841435faa5fSBarry Smith 	fm = n;
3842435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3843435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3844435faa5fSBarry Smith 	fill[fm]   = prow;
3845435faa5fSBarry Smith 	im[prow]   = 0;
3846435faa5fSBarry Smith 	nzf++;
3847335d9088SBarry Smith 	dcount++;
3848435faa5fSBarry Smith       }
3849435faa5fSBarry Smith 
38504e2b4712SSatish Balay       nzi = 0;
38514e2b4712SSatish Balay       row = fill[n];
38524e2b4712SSatish Balay       while (row < prow) {
38534e2b4712SSatish Balay 	incrlev = im[row] + 1;
38544e2b4712SSatish Balay 	nz      = dloc[row];
3855435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
38564e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
38574e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
38584e2b4712SSatish Balay 	fm      = row;
38594e2b4712SSatish Balay 	while (nnz-- > 0) {
38604e2b4712SSatish Balay 	  idx = *xi++;
38614e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
38624e2b4712SSatish Balay 	    flev++;
38634e2b4712SSatish Balay 	    continue;
38644e2b4712SSatish Balay 	  }
38654e2b4712SSatish Balay 	  do {
38664e2b4712SSatish Balay 	    m  = fm;
38674e2b4712SSatish Balay 	    fm = fill[m];
38684e2b4712SSatish Balay 	  } while (fm < idx);
38694e2b4712SSatish Balay 	  if (fm != idx) {
38704e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
38714e2b4712SSatish Balay 	    fill[m]   = idx;
38724e2b4712SSatish Balay 	    fill[idx] = fm;
38734e2b4712SSatish Balay 	    fm        = idx;
38744e2b4712SSatish Balay 	    nzf++;
3875ecf371e4SBarry Smith 	  } else {
38764e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
38774e2b4712SSatish Balay 	  }
38784e2b4712SSatish Balay 	  flev++;
38794e2b4712SSatish Balay 	}
38804e2b4712SSatish Balay 	row = fill[row];
38814e2b4712SSatish Balay 	nzi++;
38824e2b4712SSatish Balay       }
38834e2b4712SSatish Balay       /* copy new filled row into permanent storage */
38844e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
38854e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3886ecf371e4SBarry Smith 
3887ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3888ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3889ecf371e4SBarry Smith 	/* just double the memory each time */
3890690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3891ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
38924e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
38934e2b4712SSatish Balay 	jmax += maxadd;
3894ecf371e4SBarry Smith 
3895ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
38965d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
38975d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3898606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
38995d0c19d7SBarry Smith 	ajnew = xitmp;
39005d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
39015d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3902606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
39035d0c19d7SBarry Smith 	ajfill = xitmp;
3904eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
39054e2b4712SSatish Balay       }
39065d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
39074e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
39084e2b4712SSatish Balay       dloc[prow]  = nzi;
39094e2b4712SSatish Balay       fm          = fill[n];
39104e2b4712SSatish Balay       while (nzf--) {
39115d0c19d7SBarry Smith 	*xitmp++ = fm;
39124e2b4712SSatish Balay 	*flev++ = im[fm];
39134e2b4712SSatish Balay 	fm      = fill[fm];
39144e2b4712SSatish Balay       }
3915435faa5fSBarry Smith       /* make sure row has diagonal entry */
3916435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
391777431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
39182401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3919435faa5fSBarry Smith       }
39204e2b4712SSatish Balay     }
3921606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
39224e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
39234e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3924606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3925606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
39264e2b4712SSatish Balay 
39276cf91177SBarry Smith #if defined(PETSC_USE_INFO)
39284e2b4712SSatish Balay     {
3929329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3930ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3931ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3932ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3933ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3934335d9088SBarry Smith       if (diagonal_fill) {
3935ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3936335d9088SBarry Smith       }
39374e2b4712SSatish Balay     }
393863ba0a88SBarry Smith #endif
39394e2b4712SSatish Balay 
39404e2b4712SSatish Balay     /* put together the new matrix */
3941719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3942719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3943719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3944e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3945e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
39467c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3947a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
39484e2b4712SSatish Balay     b->j          = ajnew;
39494e2b4712SSatish Balay     b->i          = ainew;
39504e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
39514e2b4712SSatish Balay     b->diag       = dloc;
39524e2b4712SSatish Balay     b->ilen       = 0;
39534e2b4712SSatish Balay     b->imax       = 0;
39544e2b4712SSatish Balay     b->row        = isrow;
39554e2b4712SSatish Balay     b->col        = iscol;
3956bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3957c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3958c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3959e51c0b9cSSatish Balay     b->icol       = isicol;
396087828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
39614e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
39624e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3963719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
39644e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
39654e2b4712SSatish Balay 
3966719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3967719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3968719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
39696bce7ff8SHong Zhang 
397041df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
39718661488fSKris Buschelman   PetscFunctionReturn(0);
39728661488fSKris Buschelman }
39738661488fSKris Buschelman 
3974732ee342SKris Buschelman #undef __FUNCT__
39757e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3976dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
39777e7071cdSKris Buschelman {
397812272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
397912272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
39805a9542e3SKris Buschelman   PetscFunctionBegin;
39817cf1b8d3SKris Buschelman   /* Undo Column scaling */
39827cf1b8d3SKris Buschelman /*    while (nz--) { */
39837cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
39847cf1b8d3SKris Buschelman /*    } */
3985c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3986c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
39877cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
39887cf1b8d3SKris Buschelman }
39897cf1b8d3SKris Buschelman 
39907cf1b8d3SKris Buschelman #undef __FUNCT__
39917cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3992dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
39937cf1b8d3SKris Buschelman {
39947cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3995b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
39962aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
39975a9542e3SKris Buschelman   PetscFunctionBegin;
39980b9da03eSKris Buschelman   /* Is this really necessary? */
399920235379SKris Buschelman   while (nz--) {
40000b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
40017e7071cdSKris Buschelman   }
4002c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
40037e7071cdSKris Buschelman   PetscFunctionReturn(0);
40047e7071cdSKris Buschelman }
40057e7071cdSKris Buschelman 
4006732ee342SKris Buschelman 
4007