xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 5d0c19d75c660d4fec594a5399ec8d8ba29c54a8)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
34e2b4712SSatish Balay /*
44e2b4712SSatish Balay     Factorization code for BAIJ format.
54e2b4712SSatish Balay */
64e2b4712SSatish Balay 
74e2b4712SSatish Balay #include "src/mat/impls/baij/seq/baij.h"
84e2b4712SSatish Balay #include "src/inline/ilu.h"
974c49faeSBarry Smith #include "src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18690b6cddSBarry Smith   PetscInt       *diag = a->diag;
19f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2087828ca2SBarry Smith   PetscScalar    s1,*x,*b;
21f1af5d2fSBarry Smith 
22f1af5d2fSBarry Smith   PetscFunctionBegin;
23ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26f1af5d2fSBarry Smith 
27f1af5d2fSBarry Smith   /* forward solve the U^T */
28f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
29f1af5d2fSBarry Smith 
30f1af5d2fSBarry Smith     v     = aa + diag[i];
31f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
32ef66eb69SBarry Smith     s1    = (*v++)*x[i];
33f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
34f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
35f1af5d2fSBarry Smith     while (nz--) {
36f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
37f1af5d2fSBarry Smith     }
38f1af5d2fSBarry Smith     x[i]   = s1;
39f1af5d2fSBarry Smith   }
40f1af5d2fSBarry Smith   /* backward solve the L^T */
41f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
42f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
43f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
44f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
45f1af5d2fSBarry Smith     s1   = x[i];
46f1af5d2fSBarry Smith     while (nz--) {
47f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
48f1af5d2fSBarry Smith     }
49f1af5d2fSBarry Smith   }
501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53f1af5d2fSBarry Smith   PetscFunctionReturn(0);
54f1af5d2fSBarry Smith }
55f1af5d2fSBarry Smith 
564a2ae208SSatish Balay #undef __FUNCT__
574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59f1af5d2fSBarry Smith {
60f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61dfbe8321SBarry Smith   PetscErrorCode ierr;
62690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
64f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6587828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6687828ca2SBarry Smith   PetscScalar    *x,*b;
67f1af5d2fSBarry Smith 
68f1af5d2fSBarry Smith   PetscFunctionBegin;
69ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
701ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72f1af5d2fSBarry Smith 
73f1af5d2fSBarry Smith   /* forward solve the U^T */
74f1af5d2fSBarry Smith   idx = 0;
75f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
76f1af5d2fSBarry Smith 
77f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
78f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
79ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
80f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
81f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
82f1af5d2fSBarry Smith     v += 4;
83f1af5d2fSBarry Smith 
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       oidx = 2*(*vi++);
88f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90f1af5d2fSBarry Smith       v  += 4;
91f1af5d2fSBarry Smith     }
92f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
93f1af5d2fSBarry Smith     idx += 2;
94f1af5d2fSBarry Smith   }
95f1af5d2fSBarry Smith   /* backward solve the L^T */
96f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
97f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
98f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
99f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
100f1af5d2fSBarry Smith     idt  = 2*i;
101f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
102f1af5d2fSBarry Smith     while (nz--) {
103f1af5d2fSBarry Smith       idx   = 2*(*vi--);
104f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106f1af5d2fSBarry Smith       v -= 4;
107f1af5d2fSBarry Smith     }
108f1af5d2fSBarry Smith   }
1091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
112f1af5d2fSBarry Smith   PetscFunctionReturn(0);
113f1af5d2fSBarry Smith }
114f1af5d2fSBarry Smith 
1154a2ae208SSatish Balay #undef __FUNCT__
1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118f1af5d2fSBarry Smith {
119f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120dfbe8321SBarry Smith   PetscErrorCode ierr;
121690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
123f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12487828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12587828ca2SBarry Smith   PetscScalar    *x,*b;
126f1af5d2fSBarry Smith 
127f1af5d2fSBarry Smith   PetscFunctionBegin;
128ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith   /* forward solve the U^T */
133f1af5d2fSBarry Smith   idx = 0;
134f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
135f1af5d2fSBarry Smith 
136f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
137f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
138ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142f1af5d2fSBarry Smith     v += 9;
143f1af5d2fSBarry Smith 
144f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
145f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
146f1af5d2fSBarry Smith     while (nz--) {
147f1af5d2fSBarry Smith       oidx = 3*(*vi++);
148f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151f1af5d2fSBarry Smith       v  += 9;
152f1af5d2fSBarry Smith     }
153f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154f1af5d2fSBarry Smith     idx += 3;
155f1af5d2fSBarry Smith   }
156f1af5d2fSBarry Smith   /* backward solve the L^T */
157f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
158f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
159f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
160f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
161f1af5d2fSBarry Smith     idt  = 3*i;
162f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163f1af5d2fSBarry Smith     while (nz--) {
164f1af5d2fSBarry Smith       idx   = 3*(*vi--);
165f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168f1af5d2fSBarry Smith       v -= 9;
169f1af5d2fSBarry Smith     }
170f1af5d2fSBarry Smith   }
1711ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
174f1af5d2fSBarry Smith   PetscFunctionReturn(0);
175f1af5d2fSBarry Smith }
176f1af5d2fSBarry Smith 
1774a2ae208SSatish Balay #undef __FUNCT__
1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180f1af5d2fSBarry Smith {
181f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182dfbe8321SBarry Smith   PetscErrorCode ierr;
183690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
185f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18787828ca2SBarry Smith   PetscScalar    *x,*b;
188f1af5d2fSBarry Smith 
189f1af5d2fSBarry Smith   PetscFunctionBegin;
190ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith   /* forward solve the U^T */
195f1af5d2fSBarry Smith   idx = 0;
196f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
197f1af5d2fSBarry Smith 
198f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
199f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
200ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205f1af5d2fSBarry Smith     v += 16;
206f1af5d2fSBarry Smith 
207f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
208f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
209f1af5d2fSBarry Smith     while (nz--) {
210f1af5d2fSBarry Smith       oidx = 4*(*vi++);
211f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215f1af5d2fSBarry Smith       v  += 16;
216f1af5d2fSBarry Smith     }
217f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218f1af5d2fSBarry Smith     idx += 4;
219f1af5d2fSBarry Smith   }
220f1af5d2fSBarry Smith   /* backward solve the L^T */
221f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
222f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
223f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
224f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
225f1af5d2fSBarry Smith     idt  = 4*i;
226f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227f1af5d2fSBarry Smith     while (nz--) {
228f1af5d2fSBarry Smith       idx   = 4*(*vi--);
229f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233f1af5d2fSBarry Smith       v -= 16;
234f1af5d2fSBarry Smith     }
235f1af5d2fSBarry Smith   }
2361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
239f1af5d2fSBarry Smith   PetscFunctionReturn(0);
240f1af5d2fSBarry Smith }
241f1af5d2fSBarry Smith 
2424a2ae208SSatish Balay #undef __FUNCT__
2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245f1af5d2fSBarry Smith {
246f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247dfbe8321SBarry Smith   PetscErrorCode ierr;
248690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
250f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25287828ca2SBarry Smith   PetscScalar    *x,*b;
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   PetscFunctionBegin;
255ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258f1af5d2fSBarry Smith 
259f1af5d2fSBarry Smith   /* forward solve the U^T */
260f1af5d2fSBarry Smith   idx = 0;
261f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
262f1af5d2fSBarry Smith 
263f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
264f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
265ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271f1af5d2fSBarry Smith     v += 25;
272f1af5d2fSBarry Smith 
273f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
274f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
275f1af5d2fSBarry Smith     while (nz--) {
276f1af5d2fSBarry Smith       oidx = 5*(*vi++);
277f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282f1af5d2fSBarry Smith       v  += 25;
283f1af5d2fSBarry Smith     }
284f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285f1af5d2fSBarry Smith     idx += 5;
286f1af5d2fSBarry Smith   }
287f1af5d2fSBarry Smith   /* backward solve the L^T */
288f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
289f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
290f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
291f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
292f1af5d2fSBarry Smith     idt  = 5*i;
293f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294f1af5d2fSBarry Smith     while (nz--) {
295f1af5d2fSBarry Smith       idx   = 5*(*vi--);
296f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301f1af5d2fSBarry Smith       v -= 25;
302f1af5d2fSBarry Smith     }
303f1af5d2fSBarry Smith   }
3041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
307f1af5d2fSBarry Smith   PetscFunctionReturn(0);
308f1af5d2fSBarry Smith }
309f1af5d2fSBarry Smith 
3104a2ae208SSatish Balay #undef __FUNCT__
3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313f1af5d2fSBarry Smith {
314f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315dfbe8321SBarry Smith   PetscErrorCode ierr;
316690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
318f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32087828ca2SBarry Smith   PetscScalar    *x,*b;
321f1af5d2fSBarry Smith 
322f1af5d2fSBarry Smith   PetscFunctionBegin;
323ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326f1af5d2fSBarry Smith 
327f1af5d2fSBarry Smith   /* forward solve the U^T */
328f1af5d2fSBarry Smith   idx = 0;
329f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
332f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
333ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334ef66eb69SBarry Smith     x6    = x[5+idx];
335f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341f1af5d2fSBarry Smith     v += 36;
342f1af5d2fSBarry Smith 
343f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
344f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
345f1af5d2fSBarry Smith     while (nz--) {
346f1af5d2fSBarry Smith       oidx = 6*(*vi++);
347f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353f1af5d2fSBarry Smith       v  += 36;
354f1af5d2fSBarry Smith     }
355f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356f1af5d2fSBarry Smith     x[5+idx] = s6;
357f1af5d2fSBarry Smith     idx += 6;
358f1af5d2fSBarry Smith   }
359f1af5d2fSBarry Smith   /* backward solve the L^T */
360f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
361f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
362f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
363f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
364f1af5d2fSBarry Smith     idt  = 6*i;
365f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366f1af5d2fSBarry Smith     s6 = x[5+idt];
367f1af5d2fSBarry Smith     while (nz--) {
368f1af5d2fSBarry Smith       idx   = 6*(*vi--);
369f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375f1af5d2fSBarry Smith       v -= 36;
376f1af5d2fSBarry Smith     }
377f1af5d2fSBarry Smith   }
3781ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
381f1af5d2fSBarry Smith   PetscFunctionReturn(0);
382f1af5d2fSBarry Smith }
383f1af5d2fSBarry Smith 
3844a2ae208SSatish Balay #undef __FUNCT__
3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387f1af5d2fSBarry Smith {
388f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389dfbe8321SBarry Smith   PetscErrorCode ierr;
390690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
392f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39487828ca2SBarry Smith   PetscScalar    *x,*b;
395f1af5d2fSBarry Smith 
396f1af5d2fSBarry Smith   PetscFunctionBegin;
397ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400f1af5d2fSBarry Smith 
401f1af5d2fSBarry Smith   /* forward solve the U^T */
402f1af5d2fSBarry Smith   idx = 0;
403f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
404f1af5d2fSBarry Smith 
405f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
406f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
407ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
409f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416f1af5d2fSBarry Smith     v += 49;
417f1af5d2fSBarry Smith 
418f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
419f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
420f1af5d2fSBarry Smith     while (nz--) {
421f1af5d2fSBarry Smith       oidx = 7*(*vi++);
422f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429f1af5d2fSBarry Smith       v  += 49;
430f1af5d2fSBarry Smith     }
431f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
433f1af5d2fSBarry Smith     idx += 7;
434f1af5d2fSBarry Smith   }
435f1af5d2fSBarry Smith   /* backward solve the L^T */
436f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
437f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
438f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
439f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
440f1af5d2fSBarry Smith     idt  = 7*i;
441f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
443f1af5d2fSBarry Smith     while (nz--) {
444f1af5d2fSBarry Smith       idx   = 7*(*vi--);
445f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452f1af5d2fSBarry Smith       v -= 49;
453f1af5d2fSBarry Smith     }
454f1af5d2fSBarry Smith   }
4551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
458f1af5d2fSBarry Smith   PetscFunctionReturn(0);
459f1af5d2fSBarry Smith }
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4624a2ae208SSatish Balay #undef __FUNCT__
4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465f1af5d2fSBarry Smith {
466f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4686849ba73SBarry Smith   PetscErrorCode ierr;
469*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
470*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471690b6cddSBarry Smith   PetscInt       *diag = a->diag;
472f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47387828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   PetscFunctionBegin;
4761ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478f1af5d2fSBarry Smith   t  = a->solve_work;
479f1af5d2fSBarry Smith 
480f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482f1af5d2fSBarry Smith 
483f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
484f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
485f1af5d2fSBarry Smith     t[i] = b[c[i]];
486f1af5d2fSBarry Smith   }
487f1af5d2fSBarry Smith 
488f1af5d2fSBarry Smith   /* forward solve the U^T */
489f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
490f1af5d2fSBarry Smith 
491f1af5d2fSBarry Smith     v     = aa + diag[i];
492f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
493f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
494f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
495f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
496f1af5d2fSBarry Smith     while (nz--) {
497f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
498f1af5d2fSBarry Smith     }
499f1af5d2fSBarry Smith     t[i]   = s1;
500f1af5d2fSBarry Smith   }
501f1af5d2fSBarry Smith   /* backward solve the L^T */
502f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
503f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
504f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
505f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
506f1af5d2fSBarry Smith     s1   = t[i];
507f1af5d2fSBarry Smith     while (nz--) {
508f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
509f1af5d2fSBarry Smith     }
510f1af5d2fSBarry Smith   }
511f1af5d2fSBarry Smith 
512f1af5d2fSBarry Smith   /* copy t into x according to permutation */
513f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
514f1af5d2fSBarry Smith     x[r[i]]   = t[i];
515f1af5d2fSBarry Smith   }
516f1af5d2fSBarry Smith 
517f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5191ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528f1af5d2fSBarry Smith {
529f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5316849ba73SBarry Smith   PetscErrorCode ierr;
532*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
533*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53687828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
538f1af5d2fSBarry Smith 
539f1af5d2fSBarry Smith   PetscFunctionBegin;
5401ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542f1af5d2fSBarry Smith   t  = a->solve_work;
543f1af5d2fSBarry Smith 
544f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546f1af5d2fSBarry Smith 
547f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
548f1af5d2fSBarry Smith   ii = 0;
549f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
550f1af5d2fSBarry Smith     ic      = 2*c[i];
551f1af5d2fSBarry Smith     t[ii]   = b[ic];
552f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
553f1af5d2fSBarry Smith     ii += 2;
554f1af5d2fSBarry Smith   }
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith   /* forward solve the U^T */
557f1af5d2fSBarry Smith   idx = 0;
558f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
559f1af5d2fSBarry Smith 
560f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
561f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
562f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
563f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
564f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
565f1af5d2fSBarry Smith     v += 4;
566f1af5d2fSBarry Smith 
567f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
568f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
569f1af5d2fSBarry Smith     while (nz--) {
570f1af5d2fSBarry Smith       oidx = 2*(*vi++);
571f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573f1af5d2fSBarry Smith       v  += 4;
574f1af5d2fSBarry Smith     }
575f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
576f1af5d2fSBarry Smith     idx += 2;
577f1af5d2fSBarry Smith   }
578f1af5d2fSBarry Smith   /* backward solve the L^T */
579f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
580f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
581f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
582f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
583f1af5d2fSBarry Smith     idt  = 2*i;
584f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
585f1af5d2fSBarry Smith     while (nz--) {
586f1af5d2fSBarry Smith       idx   = 2*(*vi--);
587f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589f1af5d2fSBarry Smith       v -= 4;
590f1af5d2fSBarry Smith     }
591f1af5d2fSBarry Smith   }
592f1af5d2fSBarry Smith 
593f1af5d2fSBarry Smith   /* copy t into x according to permutation */
594f1af5d2fSBarry Smith   ii = 0;
595f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
596f1af5d2fSBarry Smith     ir      = 2*r[i];
597f1af5d2fSBarry Smith     x[ir]   = t[ii];
598f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
599f1af5d2fSBarry Smith     ii += 2;
600f1af5d2fSBarry Smith   }
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
607f1af5d2fSBarry Smith   PetscFunctionReturn(0);
608f1af5d2fSBarry Smith }
609f1af5d2fSBarry Smith 
6104a2ae208SSatish Balay #undef __FUNCT__
6114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613f1af5d2fSBarry Smith {
614f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6166849ba73SBarry Smith   PetscErrorCode ierr;
617*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
618*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62187828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
623f1af5d2fSBarry Smith 
624f1af5d2fSBarry Smith   PetscFunctionBegin;
6251ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627f1af5d2fSBarry Smith   t  = a->solve_work;
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631f1af5d2fSBarry Smith 
632f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
633f1af5d2fSBarry Smith   ii = 0;
634f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
635f1af5d2fSBarry Smith     ic      = 3*c[i];
636f1af5d2fSBarry Smith     t[ii]   = b[ic];
637f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
638f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
639f1af5d2fSBarry Smith     ii += 3;
640f1af5d2fSBarry Smith   }
641f1af5d2fSBarry Smith 
642f1af5d2fSBarry Smith   /* forward solve the U^T */
643f1af5d2fSBarry Smith   idx = 0;
644f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
645f1af5d2fSBarry Smith 
646f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
647f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
648f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652f1af5d2fSBarry Smith     v += 9;
653f1af5d2fSBarry Smith 
654f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
655f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
656f1af5d2fSBarry Smith     while (nz--) {
657f1af5d2fSBarry Smith       oidx = 3*(*vi++);
658f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661f1af5d2fSBarry Smith       v  += 9;
662f1af5d2fSBarry Smith     }
663f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664f1af5d2fSBarry Smith     idx += 3;
665f1af5d2fSBarry Smith   }
666f1af5d2fSBarry Smith   /* backward solve the L^T */
667f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
668f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
669f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
670f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
671f1af5d2fSBarry Smith     idt  = 3*i;
672f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673f1af5d2fSBarry Smith     while (nz--) {
674f1af5d2fSBarry Smith       idx   = 3*(*vi--);
675f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678f1af5d2fSBarry Smith       v -= 9;
679f1af5d2fSBarry Smith     }
680f1af5d2fSBarry Smith   }
681f1af5d2fSBarry Smith 
682f1af5d2fSBarry Smith   /* copy t into x according to permutation */
683f1af5d2fSBarry Smith   ii = 0;
684f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
685f1af5d2fSBarry Smith     ir      = 3*r[i];
686f1af5d2fSBarry Smith     x[ir]   = t[ii];
687f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
688f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
689f1af5d2fSBarry Smith     ii += 3;
690f1af5d2fSBarry Smith   }
691f1af5d2fSBarry Smith 
692f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
697f1af5d2fSBarry Smith   PetscFunctionReturn(0);
698f1af5d2fSBarry Smith }
699f1af5d2fSBarry Smith 
7004a2ae208SSatish Balay #undef __FUNCT__
7014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703f1af5d2fSBarry Smith {
704f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7066849ba73SBarry Smith   PetscErrorCode ierr;
707*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
708*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
713f1af5d2fSBarry Smith 
714f1af5d2fSBarry Smith   PetscFunctionBegin;
7151ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717f1af5d2fSBarry Smith   t  = a->solve_work;
718f1af5d2fSBarry Smith 
719f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
723f1af5d2fSBarry Smith   ii = 0;
724f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
725f1af5d2fSBarry Smith     ic      = 4*c[i];
726f1af5d2fSBarry Smith     t[ii]   = b[ic];
727f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
728f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
729f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
730f1af5d2fSBarry Smith     ii += 4;
731f1af5d2fSBarry Smith   }
732f1af5d2fSBarry Smith 
733f1af5d2fSBarry Smith   /* forward solve the U^T */
734f1af5d2fSBarry Smith   idx = 0;
735f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
736f1af5d2fSBarry Smith 
737f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
738f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
739f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744f1af5d2fSBarry Smith     v += 16;
745f1af5d2fSBarry Smith 
746f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
747f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
748f1af5d2fSBarry Smith     while (nz--) {
749f1af5d2fSBarry Smith       oidx = 4*(*vi++);
750f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754f1af5d2fSBarry Smith       v  += 16;
755f1af5d2fSBarry Smith     }
756f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757f1af5d2fSBarry Smith     idx += 4;
758f1af5d2fSBarry Smith   }
759f1af5d2fSBarry Smith   /* backward solve the L^T */
760f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
761f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
762f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
763f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
764f1af5d2fSBarry Smith     idt  = 4*i;
765f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766f1af5d2fSBarry Smith     while (nz--) {
767f1af5d2fSBarry Smith       idx   = 4*(*vi--);
768f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772f1af5d2fSBarry Smith       v -= 16;
773f1af5d2fSBarry Smith     }
774f1af5d2fSBarry Smith   }
775f1af5d2fSBarry Smith 
776f1af5d2fSBarry Smith   /* copy t into x according to permutation */
777f1af5d2fSBarry Smith   ii = 0;
778f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
779f1af5d2fSBarry Smith     ir      = 4*r[i];
780f1af5d2fSBarry Smith     x[ir]   = t[ii];
781f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
782f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
783f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
784f1af5d2fSBarry Smith     ii += 4;
785f1af5d2fSBarry Smith   }
786f1af5d2fSBarry Smith 
787f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
792f1af5d2fSBarry Smith   PetscFunctionReturn(0);
793f1af5d2fSBarry Smith }
794f1af5d2fSBarry Smith 
7954a2ae208SSatish Balay #undef __FUNCT__
7964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798f1af5d2fSBarry Smith {
799f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8016849ba73SBarry Smith   PetscErrorCode ierr;
802*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
803*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
808f1af5d2fSBarry Smith 
809f1af5d2fSBarry Smith   PetscFunctionBegin;
8101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812f1af5d2fSBarry Smith   t  = a->solve_work;
813f1af5d2fSBarry Smith 
814f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816f1af5d2fSBarry Smith 
817f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
818f1af5d2fSBarry Smith   ii = 0;
819f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
820f1af5d2fSBarry Smith     ic      = 5*c[i];
821f1af5d2fSBarry Smith     t[ii]   = b[ic];
822f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
823f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
824f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
825f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
826f1af5d2fSBarry Smith     ii += 5;
827f1af5d2fSBarry Smith   }
828f1af5d2fSBarry Smith 
829f1af5d2fSBarry Smith   /* forward solve the U^T */
830f1af5d2fSBarry Smith   idx = 0;
831f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
832f1af5d2fSBarry Smith 
833f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
834f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
835f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841f1af5d2fSBarry Smith     v += 25;
842f1af5d2fSBarry Smith 
843f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
844f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
845f1af5d2fSBarry Smith     while (nz--) {
846f1af5d2fSBarry Smith       oidx = 5*(*vi++);
847f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852f1af5d2fSBarry Smith       v  += 25;
853f1af5d2fSBarry Smith     }
854f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855f1af5d2fSBarry Smith     idx += 5;
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith   /* backward solve the L^T */
858f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
859f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
860f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
861f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
862f1af5d2fSBarry Smith     idt  = 5*i;
863f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864f1af5d2fSBarry Smith     while (nz--) {
865f1af5d2fSBarry Smith       idx   = 5*(*vi--);
866f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871f1af5d2fSBarry Smith       v -= 25;
872f1af5d2fSBarry Smith     }
873f1af5d2fSBarry Smith   }
874f1af5d2fSBarry Smith 
875f1af5d2fSBarry Smith   /* copy t into x according to permutation */
876f1af5d2fSBarry Smith   ii = 0;
877f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
878f1af5d2fSBarry Smith     ir      = 5*r[i];
879f1af5d2fSBarry Smith     x[ir]   = t[ii];
880f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
881f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
882f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
883f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
884f1af5d2fSBarry Smith     ii += 5;
885f1af5d2fSBarry Smith   }
886f1af5d2fSBarry Smith 
887f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   PetscFunctionReturn(0);
893f1af5d2fSBarry Smith }
894f1af5d2fSBarry Smith 
8954a2ae208SSatish Balay #undef __FUNCT__
8964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898f1af5d2fSBarry Smith {
899f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9016849ba73SBarry Smith   PetscErrorCode ierr;
902*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
903*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
908f1af5d2fSBarry Smith 
909f1af5d2fSBarry Smith   PetscFunctionBegin;
9101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912f1af5d2fSBarry Smith   t  = a->solve_work;
913f1af5d2fSBarry Smith 
914f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916f1af5d2fSBarry Smith 
917f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
918f1af5d2fSBarry Smith   ii = 0;
919f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
920f1af5d2fSBarry Smith     ic      = 6*c[i];
921f1af5d2fSBarry Smith     t[ii]   = b[ic];
922f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
923f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
924f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
925f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
926f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
927f1af5d2fSBarry Smith     ii += 6;
928f1af5d2fSBarry Smith   }
929f1af5d2fSBarry Smith 
930f1af5d2fSBarry Smith   /* forward solve the U^T */
931f1af5d2fSBarry Smith   idx = 0;
932f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
933f1af5d2fSBarry Smith 
934f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
935f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
936f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937f1af5d2fSBarry Smith     x6    = t[5+idx];
938f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944f1af5d2fSBarry Smith     v += 36;
945f1af5d2fSBarry Smith 
946f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
947f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
948f1af5d2fSBarry Smith     while (nz--) {
949f1af5d2fSBarry Smith       oidx = 6*(*vi++);
950f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956f1af5d2fSBarry Smith       v  += 36;
957f1af5d2fSBarry Smith     }
958f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959f1af5d2fSBarry Smith     t[5+idx] = s6;
960f1af5d2fSBarry Smith     idx += 6;
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith   /* backward solve the L^T */
963f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
964f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
965f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
966f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
967f1af5d2fSBarry Smith     idt  = 6*i;
968f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969f1af5d2fSBarry Smith     s6 = t[5+idt];
970f1af5d2fSBarry Smith     while (nz--) {
971f1af5d2fSBarry Smith       idx   = 6*(*vi--);
972f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978f1af5d2fSBarry Smith       v -= 36;
979f1af5d2fSBarry Smith     }
980f1af5d2fSBarry Smith   }
981f1af5d2fSBarry Smith 
982f1af5d2fSBarry Smith   /* copy t into x according to permutation */
983f1af5d2fSBarry Smith   ii = 0;
984f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
985f1af5d2fSBarry Smith     ir      = 6*r[i];
986f1af5d2fSBarry Smith     x[ir]   = t[ii];
987f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
988f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
989f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
990f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
991f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
992f1af5d2fSBarry Smith     ii += 6;
993f1af5d2fSBarry Smith   }
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9971ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
1000f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1001f1af5d2fSBarry Smith }
1002f1af5d2fSBarry Smith 
10034a2ae208SSatish Balay #undef __FUNCT__
10044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006f1af5d2fSBarry Smith {
1007f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10096849ba73SBarry Smith   PetscErrorCode ierr;
1010*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
1011*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1016f1af5d2fSBarry Smith 
1017f1af5d2fSBarry Smith   PetscFunctionBegin;
10181ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020f1af5d2fSBarry Smith   t  = a->solve_work;
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024f1af5d2fSBarry Smith 
1025f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1026f1af5d2fSBarry Smith   ii = 0;
1027f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1028f1af5d2fSBarry Smith     ic      = 7*c[i];
1029f1af5d2fSBarry Smith     t[ii]   = b[ic];
1030f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1031f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1032f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1033f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1034f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1035f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1036f1af5d2fSBarry Smith     ii += 7;
1037f1af5d2fSBarry Smith   }
1038f1af5d2fSBarry Smith 
1039f1af5d2fSBarry Smith   /* forward solve the U^T */
1040f1af5d2fSBarry Smith   idx = 0;
1041f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1044f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1045f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1047f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054f1af5d2fSBarry Smith     v += 49;
1055f1af5d2fSBarry Smith 
1056f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1057f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1058f1af5d2fSBarry Smith     while (nz--) {
1059f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1060f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067f1af5d2fSBarry Smith       v  += 49;
1068f1af5d2fSBarry Smith     }
1069f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1071f1af5d2fSBarry Smith     idx += 7;
1072f1af5d2fSBarry Smith   }
1073f1af5d2fSBarry Smith   /* backward solve the L^T */
1074f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1075f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1076f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1077f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1078f1af5d2fSBarry Smith     idt  = 7*i;
1079f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1081f1af5d2fSBarry Smith     while (nz--) {
1082f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1083f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090f1af5d2fSBarry Smith       v -= 49;
1091f1af5d2fSBarry Smith     }
1092f1af5d2fSBarry Smith   }
1093f1af5d2fSBarry Smith 
1094f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1095f1af5d2fSBarry Smith   ii = 0;
1096f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1097f1af5d2fSBarry Smith     ir      = 7*r[i];
1098f1af5d2fSBarry Smith     x[ir]   = t[ii];
1099f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1100f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1101f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1102f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1103f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1104f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1105f1af5d2fSBarry Smith     ii += 7;
1106f1af5d2fSBarry Smith   }
1107f1af5d2fSBarry Smith 
1108f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
1113f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1114f1af5d2fSBarry Smith }
1115f1af5d2fSBarry Smith 
11164e2b4712SSatish Balay /* ----------------------------------------------------------- */
11174a2ae208SSatish Balay #undef __FUNCT__
11184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11204e2b4712SSatish Balay {
11214e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11224e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11236849ba73SBarry Smith   PetscErrorCode ierr;
1124*5d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
1125*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
1126*5d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11273f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
112887828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11294e2b4712SSatish Balay 
11304e2b4712SSatish Balay   PetscFunctionBegin;
11311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133f1af5d2fSBarry Smith   t  = a->solve_work;
11344e2b4712SSatish Balay 
11354e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11364e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11374e2b4712SSatish Balay 
11384e2b4712SSatish Balay   /* forward solve the lower triangular */
113987828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11404e2b4712SSatish Balay   for (i=1; i<n; i++) {
11414e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11424e2b4712SSatish Balay     vi  = aj + ai[i];
11434e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1144f1af5d2fSBarry Smith     s = t + bs*i;
114587828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11464e2b4712SSatish Balay     while (nz--) {
1147f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11484e2b4712SSatish Balay       v += bs2;
11494e2b4712SSatish Balay     }
11504e2b4712SSatish Balay   }
11514e2b4712SSatish Balay   /* backward solve the upper triangular */
1152d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11544e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11554e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11564e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115787828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11584e2b4712SSatish Balay     while (nz--) {
1159f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11604e2b4712SSatish Balay       v += bs2;
11614e2b4712SSatish Balay     }
1162f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116387828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11644e2b4712SSatish Balay   }
11654e2b4712SSatish Balay 
11664e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11674e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11714e2b4712SSatish Balay   PetscFunctionReturn(0);
11724e2b4712SSatish Balay }
11734e2b4712SSatish Balay 
11744a2ae208SSatish Balay #undef __FUNCT__
11754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11774e2b4712SSatish Balay {
11784e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11794e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11806849ba73SBarry Smith   PetscErrorCode ierr;
1181*5d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
1182*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11833f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11864e2b4712SSatish Balay 
11874e2b4712SSatish Balay   PetscFunctionBegin;
11881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190f1af5d2fSBarry Smith   t  = a->solve_work;
11914e2b4712SSatish Balay 
11924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11944e2b4712SSatish Balay 
11954e2b4712SSatish Balay   /* forward solve the lower triangular */
11964e2b4712SSatish Balay   idx    = 7*(*r++);
1197f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1198f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12004e2b4712SSatish Balay 
12014e2b4712SSatish Balay   for (i=1; i<n; i++) {
12024e2b4712SSatish Balay     v     = aa + 49*ai[i];
12034e2b4712SSatish Balay     vi    = aj + ai[i];
12044e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12054e2b4712SSatish Balay     idx   = 7*(*r++);
1206f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12084e2b4712SSatish Balay     while (nz--) {
12094e2b4712SSatish Balay       idx   = 7*(*vi++);
1210f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1212f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1213f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12204e2b4712SSatish Balay       v += 49;
12214e2b4712SSatish Balay     }
12224e2b4712SSatish Balay     idx = 7*i;
1223f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1224f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12264e2b4712SSatish Balay   }
12274e2b4712SSatish Balay   /* backward solve the upper triangular */
12284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12294e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12304e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12314e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12324e2b4712SSatish Balay     idt  = 7*i;
1233f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1234f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12364e2b4712SSatish Balay     while (nz--) {
12374e2b4712SSatish Balay       idx   = 7*(*vi++);
1238f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1239f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1241f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12484e2b4712SSatish Balay       v += 49;
12494e2b4712SSatish Balay     }
12504e2b4712SSatish Balay     idc = 7*(*c--);
12514e2b4712SSatish Balay     v   = aa + 49*diag[i];
1252f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12664e2b4712SSatish Balay   }
12674e2b4712SSatish Balay 
12684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272d0f46423SBarry Smith   ierr = PetscLogFlops(2*49*(a->nz) - 7*A->cmap->n);CHKERRQ(ierr);
12734e2b4712SSatish Balay   PetscFunctionReturn(0);
12744e2b4712SSatish Balay }
12754e2b4712SSatish Balay 
12764a2ae208SSatish Balay #undef __FUNCT__
12774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
127915091d37SBarry Smith {
128015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282dfbe8321SBarry Smith   PetscErrorCode    ierr;
1283690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1284d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1285d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286d9fead3dSBarry Smith   const PetscScalar *b;
128715091d37SBarry Smith 
128815091d37SBarry Smith   PetscFunctionBegin;
1289d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129115091d37SBarry Smith   /* forward solve the lower triangular */
129215091d37SBarry Smith   idx    = 0;
129315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
129415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
129515091d37SBarry Smith   x[6] = b[6+idx];
129615091d37SBarry Smith   for (i=1; i<n; i++) {
129715091d37SBarry Smith     v     =  aa + 49*ai[i];
129815091d37SBarry Smith     vi    =  aj + ai[i];
129915091d37SBarry Smith     nz    =  diag[i] - ai[i];
130015091d37SBarry Smith     idx   =  7*i;
1301f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303f1af5d2fSBarry Smith     s7  =  b[6+idx];
130415091d37SBarry Smith     while (nz--) {
130515091d37SBarry Smith       jdx   = 7*(*vi++);
130615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
130715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
130815091d37SBarry Smith       x7    = x[6+jdx];
1309f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131615091d37SBarry Smith       v += 49;
131715091d37SBarry Smith      }
1318f1af5d2fSBarry Smith     x[idx]   = s1;
1319f1af5d2fSBarry Smith     x[1+idx] = s2;
1320f1af5d2fSBarry Smith     x[2+idx] = s3;
1321f1af5d2fSBarry Smith     x[3+idx] = s4;
1322f1af5d2fSBarry Smith     x[4+idx] = s5;
1323f1af5d2fSBarry Smith     x[5+idx] = s6;
1324f1af5d2fSBarry Smith     x[6+idx] = s7;
132515091d37SBarry Smith   }
132615091d37SBarry Smith   /* backward solve the upper triangular */
132715091d37SBarry Smith   for (i=n-1; i>=0; i--){
132815091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
132915091d37SBarry Smith     vi   = aj + diag[i] + 1;
133015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
133115091d37SBarry Smith     idt  = 7*i;
1332f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1333f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1334f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1335f1af5d2fSBarry Smith     s7 = x[6+idt];
133615091d37SBarry Smith     while (nz--) {
133715091d37SBarry Smith       idx   = 7*(*vi++);
133815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
133915091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
134015091d37SBarry Smith       x7    = x[6+idx];
1341f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134815091d37SBarry Smith       v += 49;
134915091d37SBarry Smith     }
135015091d37SBarry Smith     v        = aa + 49*diag[i];
1351f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
136515091d37SBarry Smith   }
136615091d37SBarry Smith 
1367d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
137015091d37SBarry Smith   PetscFunctionReturn(0);
137115091d37SBarry Smith }
137215091d37SBarry Smith 
13734a2ae208SSatish Balay #undef __FUNCT__
13744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1375dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
137615091d37SBarry Smith {
137715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
137815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
13796849ba73SBarry Smith   PetscErrorCode    ierr;
1380*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1381*5d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1383d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384d9fead3dSBarry Smith   const PetscScalar *b;
138515091d37SBarry Smith   PetscFunctionBegin;
1386d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1388f1af5d2fSBarry Smith   t  = a->solve_work;
138915091d37SBarry Smith 
139015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
139115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
139215091d37SBarry Smith 
139315091d37SBarry Smith   /* forward solve the lower triangular */
139415091d37SBarry Smith   idx    = 6*(*r++);
1395f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1396f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1397f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
139815091d37SBarry Smith   for (i=1; i<n; i++) {
139915091d37SBarry Smith     v     = aa + 36*ai[i];
140015091d37SBarry Smith     vi    = aj + ai[i];
140115091d37SBarry Smith     nz    = diag[i] - ai[i];
140215091d37SBarry Smith     idx   = 6*(*r++);
1403f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
140515091d37SBarry Smith     while (nz--) {
140615091d37SBarry Smith       idx   = 6*(*vi++);
1407f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1408f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141515091d37SBarry Smith       v += 36;
141615091d37SBarry Smith     }
141715091d37SBarry Smith     idx = 6*i;
1418f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1419f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1420f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
142115091d37SBarry Smith   }
142215091d37SBarry Smith   /* backward solve the upper triangular */
142315091d37SBarry Smith   for (i=n-1; i>=0; i--){
142415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
142515091d37SBarry Smith     vi   = aj + diag[i] + 1;
142615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
142715091d37SBarry Smith     idt  = 6*i;
1428f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1429f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1430f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
143115091d37SBarry Smith     while (nz--) {
143215091d37SBarry Smith       idx   = 6*(*vi++);
1433f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1434f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1435f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1436f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
144215091d37SBarry Smith       v += 36;
144315091d37SBarry Smith     }
144415091d37SBarry Smith     idc = 6*(*c--);
144515091d37SBarry Smith     v   = aa + 36*diag[i];
1446f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1447f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1448f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1450f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1452f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1454f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1456f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
145815091d37SBarry Smith   }
145915091d37SBarry Smith 
146015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
146115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1462d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1464d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
146515091d37SBarry Smith   PetscFunctionReturn(0);
146615091d37SBarry Smith }
146715091d37SBarry Smith 
14684a2ae208SSatish Balay #undef __FUNCT__
14694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
147115091d37SBarry Smith {
147215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1473690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474dfbe8321SBarry Smith   PetscErrorCode    ierr;
1475690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1476d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1477d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478d9fead3dSBarry Smith   const PetscScalar *b;
147915091d37SBarry Smith 
148015091d37SBarry Smith   PetscFunctionBegin;
1481d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
148315091d37SBarry Smith   /* forward solve the lower triangular */
148415091d37SBarry Smith   idx    = 0;
148515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
148615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
148715091d37SBarry Smith   for (i=1; i<n; i++) {
148815091d37SBarry Smith     v     =  aa + 36*ai[i];
148915091d37SBarry Smith     vi    =  aj + ai[i];
149015091d37SBarry Smith     nz    =  diag[i] - ai[i];
149115091d37SBarry Smith     idx   =  6*i;
1492f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1493f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
149415091d37SBarry Smith     while (nz--) {
149515091d37SBarry Smith       jdx   = 6*(*vi++);
149615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
149715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150415091d37SBarry Smith       v += 36;
150515091d37SBarry Smith      }
1506f1af5d2fSBarry Smith     x[idx]   = s1;
1507f1af5d2fSBarry Smith     x[1+idx] = s2;
1508f1af5d2fSBarry Smith     x[2+idx] = s3;
1509f1af5d2fSBarry Smith     x[3+idx] = s4;
1510f1af5d2fSBarry Smith     x[4+idx] = s5;
1511f1af5d2fSBarry Smith     x[5+idx] = s6;
151215091d37SBarry Smith   }
151315091d37SBarry Smith   /* backward solve the upper triangular */
151415091d37SBarry Smith   for (i=n-1; i>=0; i--){
151515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
151615091d37SBarry Smith     vi   = aj + diag[i] + 1;
151715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151815091d37SBarry Smith     idt  = 6*i;
1519f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1520f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1521f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
152215091d37SBarry Smith     while (nz--) {
152315091d37SBarry Smith       idx   = 6*(*vi++);
152415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
152515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
153215091d37SBarry Smith       v += 36;
153315091d37SBarry Smith     }
153415091d37SBarry Smith     v        = aa + 36*diag[i];
1535f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
154115091d37SBarry Smith   }
154215091d37SBarry Smith 
1543d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1545d0f46423SBarry Smith   ierr = PetscLogFlops(2*36*(a->nz) - 6*A->cmap->n);CHKERRQ(ierr);
154615091d37SBarry Smith   PetscFunctionReturn(0);
154715091d37SBarry Smith }
154815091d37SBarry Smith 
15494a2ae208SSatish Balay #undef __FUNCT__
15504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1551dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15524e2b4712SSatish Balay {
15534e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
15544e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
15556849ba73SBarry Smith   PetscErrorCode    ierr;
1556*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
1557*5d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1559d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560d9fead3dSBarry Smith   const PetscScalar *b;
15614e2b4712SSatish Balay 
15624e2b4712SSatish Balay   PetscFunctionBegin;
1563d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565f1af5d2fSBarry Smith   t  = a->solve_work;
15664e2b4712SSatish Balay 
15674e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15684e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15694e2b4712SSatish Balay 
15704e2b4712SSatish Balay   /* forward solve the lower triangular */
15714e2b4712SSatish Balay   idx    = 5*(*r++);
1572f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1573f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15744e2b4712SSatish Balay   for (i=1; i<n; i++) {
15754e2b4712SSatish Balay     v     = aa + 25*ai[i];
15764e2b4712SSatish Balay     vi    = aj + ai[i];
15774e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15784e2b4712SSatish Balay     idx   = 5*(*r++);
1579f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580f1af5d2fSBarry Smith     s5  = b[4+idx];
15814e2b4712SSatish Balay     while (nz--) {
15824e2b4712SSatish Balay       idx   = 5*(*vi++);
1583f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1584f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1585f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15904e2b4712SSatish Balay       v += 25;
15914e2b4712SSatish Balay     }
15924e2b4712SSatish Balay     idx = 5*i;
1593f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1594f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15954e2b4712SSatish Balay   }
15964e2b4712SSatish Balay   /* backward solve the upper triangular */
15974e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15984e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15994e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
16004e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
16014e2b4712SSatish Balay     idt  = 5*i;
1602f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1603f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
16044e2b4712SSatish Balay     while (nz--) {
16054e2b4712SSatish Balay       idx   = 5*(*vi++);
1606f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1607f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16134e2b4712SSatish Balay       v += 25;
16144e2b4712SSatish Balay     }
16154e2b4712SSatish Balay     idc = 5*(*c--);
16164e2b4712SSatish Balay     v   = aa + 25*diag[i];
1617f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1618f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1619f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1621f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1623f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1625f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16274e2b4712SSatish Balay   }
16284e2b4712SSatish Balay 
16294e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16304e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
16344e2b4712SSatish Balay   PetscFunctionReturn(0);
16354e2b4712SSatish Balay }
16364e2b4712SSatish Balay 
16374a2ae208SSatish Balay #undef __FUNCT__
16384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1639dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
164015091d37SBarry Smith {
164115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1642690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1643dfbe8321SBarry Smith   PetscErrorCode    ierr;
1644690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1645d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1646d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1647d9fead3dSBarry Smith   const PetscScalar *b;
164815091d37SBarry Smith 
164915091d37SBarry Smith   PetscFunctionBegin;
1650d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16511ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
165215091d37SBarry Smith   /* forward solve the lower triangular */
165315091d37SBarry Smith   idx    = 0;
165415091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
165515091d37SBarry Smith   for (i=1; i<n; i++) {
165615091d37SBarry Smith     v     =  aa + 25*ai[i];
165715091d37SBarry Smith     vi    =  aj + ai[i];
165815091d37SBarry Smith     nz    =  diag[i] - ai[i];
165915091d37SBarry Smith     idx   =  5*i;
1660f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
166115091d37SBarry Smith     while (nz--) {
166215091d37SBarry Smith       jdx   = 5*(*vi++);
166315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1664f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1665f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1666f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1667f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1668f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166915091d37SBarry Smith       v    += 25;
167015091d37SBarry Smith     }
1671f1af5d2fSBarry Smith     x[idx]   = s1;
1672f1af5d2fSBarry Smith     x[1+idx] = s2;
1673f1af5d2fSBarry Smith     x[2+idx] = s3;
1674f1af5d2fSBarry Smith     x[3+idx] = s4;
1675f1af5d2fSBarry Smith     x[4+idx] = s5;
167615091d37SBarry Smith   }
167715091d37SBarry Smith   /* backward solve the upper triangular */
167815091d37SBarry Smith   for (i=n-1; i>=0; i--){
167915091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
168015091d37SBarry Smith     vi   = aj + diag[i] + 1;
168115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
168215091d37SBarry Smith     idt  = 5*i;
1683f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1684f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
168515091d37SBarry Smith     while (nz--) {
168615091d37SBarry Smith       idx   = 5*(*vi++);
168715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1688f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1689f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1690f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1691f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1692f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
169315091d37SBarry Smith       v    += 25;
169415091d37SBarry Smith     }
169515091d37SBarry Smith     v        = aa + 25*diag[i];
1696f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1697f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1698f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1699f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1700f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
170115091d37SBarry Smith   }
170215091d37SBarry Smith 
1703d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1705d0f46423SBarry Smith   ierr = PetscLogFlops(2*25*(a->nz) - 5*A->cmap->n);CHKERRQ(ierr);
170615091d37SBarry Smith   PetscFunctionReturn(0);
170715091d37SBarry Smith }
170815091d37SBarry Smith 
17094a2ae208SSatish Balay #undef __FUNCT__
17104a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1711dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
17124e2b4712SSatish Balay {
17134e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
17144e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17156849ba73SBarry Smith   PetscErrorCode    ierr;
1716*5d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1717*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1718d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1719d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1720d9fead3dSBarry Smith   const PetscScalar *b;
17214e2b4712SSatish Balay 
17224e2b4712SSatish Balay   PetscFunctionBegin;
1723d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1725f1af5d2fSBarry Smith   t  = a->solve_work;
17264e2b4712SSatish Balay 
17274e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17284e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17294e2b4712SSatish Balay 
17304e2b4712SSatish Balay   /* forward solve the lower triangular */
17314e2b4712SSatish Balay   idx    = 4*(*r++);
1732f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1733f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
17344e2b4712SSatish Balay   for (i=1; i<n; i++) {
17354e2b4712SSatish Balay     v     = aa + 16*ai[i];
17364e2b4712SSatish Balay     vi    = aj + ai[i];
17374e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17384e2b4712SSatish Balay     idx   = 4*(*r++);
1739f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
17404e2b4712SSatish Balay     while (nz--) {
17414e2b4712SSatish Balay       idx   = 4*(*vi++);
1742f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1743f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1744f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1745f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1746f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
17474e2b4712SSatish Balay       v    += 16;
17484e2b4712SSatish Balay     }
17494e2b4712SSatish Balay     idx        = 4*i;
1750f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1751f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
17524e2b4712SSatish Balay   }
17534e2b4712SSatish Balay   /* backward solve the upper triangular */
17544e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17554e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
17564e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17574e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17584e2b4712SSatish Balay     idt  = 4*i;
1759f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1760f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
17614e2b4712SSatish Balay     while (nz--) {
17624e2b4712SSatish Balay       idx   = 4*(*vi++);
1763f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1764f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1765f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1766f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1767f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1768f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
17694e2b4712SSatish Balay       v += 16;
17704e2b4712SSatish Balay     }
17714e2b4712SSatish Balay     idc      = 4*(*c--);
17724e2b4712SSatish Balay     v        = aa + 16*diag[i];
1773f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1774f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1775f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1776f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
17774e2b4712SSatish Balay   }
17784e2b4712SSatish Balay 
17794e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
17804e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1781d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17821ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1783d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
17844e2b4712SSatish Balay   PetscFunctionReturn(0);
17854e2b4712SSatish Balay }
1786f26ec98cSKris Buschelman 
1787f26ec98cSKris Buschelman #undef __FUNCT__
1788f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1789dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1790f26ec98cSKris Buschelman {
1791f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1792f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
17936849ba73SBarry Smith   PetscErrorCode    ierr;
1794*5d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1795*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1796d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1797d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1798d9fead3dSBarry Smith   PetscScalar       *x;
1799d9fead3dSBarry Smith   const PetscScalar *b;
1800f26ec98cSKris Buschelman 
1801f26ec98cSKris Buschelman   PetscFunctionBegin;
1802d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18031ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1804f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1805f26ec98cSKris Buschelman 
1806f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1807f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1808f26ec98cSKris Buschelman 
1809f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1810f26ec98cSKris Buschelman   idx    = 4*(*r++);
1811f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1812f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1813f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1814f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1815f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1816f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1817f26ec98cSKris Buschelman     vi    = aj + ai[i];
1818f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1819f26ec98cSKris Buschelman     idx   = 4*(*r++);
1820f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1821f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1822f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1823f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1824f26ec98cSKris Buschelman     while (nz--) {
1825f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1826f26ec98cSKris Buschelman       x1  = t[idx];
1827f26ec98cSKris Buschelman       x2  = t[1+idx];
1828f26ec98cSKris Buschelman       x3  = t[2+idx];
1829f26ec98cSKris Buschelman       x4  = t[3+idx];
1830f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1831f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1832f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1833f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1834f26ec98cSKris Buschelman       v    += 16;
1835f26ec98cSKris Buschelman     }
1836f26ec98cSKris Buschelman     idx        = 4*i;
1837f26ec98cSKris Buschelman     t[idx]   = s1;
1838f26ec98cSKris Buschelman     t[1+idx] = s2;
1839f26ec98cSKris Buschelman     t[2+idx] = s3;
1840f26ec98cSKris Buschelman     t[3+idx] = s4;
1841f26ec98cSKris Buschelman   }
1842f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1843f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1844f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1845f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1846f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1847f26ec98cSKris Buschelman     idt  = 4*i;
1848f26ec98cSKris Buschelman     s1 = t[idt];
1849f26ec98cSKris Buschelman     s2 = t[1+idt];
1850f26ec98cSKris Buschelman     s3 = t[2+idt];
1851f26ec98cSKris Buschelman     s4 = t[3+idt];
1852f26ec98cSKris Buschelman     while (nz--) {
1853f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1854f26ec98cSKris Buschelman       x1  = t[idx];
1855f26ec98cSKris Buschelman       x2  = t[1+idx];
1856f26ec98cSKris Buschelman       x3  = t[2+idx];
1857f26ec98cSKris Buschelman       x4  = t[3+idx];
1858f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1859f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1860f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1861f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1862f26ec98cSKris Buschelman       v += 16;
1863f26ec98cSKris Buschelman     }
1864f26ec98cSKris Buschelman     idc      = 4*(*c--);
1865f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1866f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1867f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1868f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1869f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1870f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1871f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1872f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1873f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1874f26ec98cSKris Buschelman  }
1875f26ec98cSKris Buschelman 
1876f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1877f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1878d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1880d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
1881f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1882f26ec98cSKris Buschelman }
1883f26ec98cSKris Buschelman 
188424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
188524c233c2SKris Buschelman 
188624c233c2SKris Buschelman #include PETSC_HAVE_SSE
188724c233c2SKris Buschelman 
188824c233c2SKris Buschelman #undef __FUNCT__
188924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1890dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
189124c233c2SKris Buschelman {
189224c233c2SKris Buschelman   /*
189324c233c2SKris Buschelman      Note: This code uses demotion of double
189424c233c2SKris Buschelman      to float when performing the mixed-mode computation.
189524c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
189624c233c2SKris Buschelman   */
189724c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
189824c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
18996849ba73SBarry Smith   PetscErrorCode ierr;
1900*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
1901*5d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
190224c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
190387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
190424c233c2SKris Buschelman 
190524c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
190624c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
190724c233c2SKris Buschelman   unsigned long   offset;
190824c233c2SKris Buschelman 
190924c233c2SKris Buschelman   PetscFunctionBegin;
191024c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
191124c233c2SKris Buschelman 
191224c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
191324c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
191424c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
191524c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
191624c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
191724c233c2SKris Buschelman 
19181ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
19191ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192024c233c2SKris Buschelman     t  = a->solve_work;
192124c233c2SKris Buschelman 
192224c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
192324c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
192424c233c2SKris Buschelman 
192524c233c2SKris Buschelman     /* forward solve the lower triangular */
192624c233c2SKris Buschelman     idx  = 4*(*r++);
192724c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
192824c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
192924c233c2SKris Buschelman     v    =  aa + 16*ai[1];
193024c233c2SKris Buschelman 
193124c233c2SKris Buschelman     for (i=1; i<n;) {
193224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
193324c233c2SKris Buschelman       vi   =  aj      + ai[i];
193424c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
193524c233c2SKris Buschelman       idx  =  4*(*r++);
193624c233c2SKris Buschelman 
193724c233c2SKris Buschelman       /* Demote sum from double to float */
193824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
193924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
194024c233c2SKris Buschelman 
194124c233c2SKris Buschelman       while (nz--) {
194224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
194324c233c2SKris Buschelman         idx = 4*(*vi++);
194424c233c2SKris Buschelman 
194524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
194624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
194724c233c2SKris Buschelman 
194824c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
194924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
195024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
195124c233c2SKris Buschelman 
195224c233c2SKris Buschelman           /* First Column */
195324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
195424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
195524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
195624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
195724c233c2SKris Buschelman 
195824c233c2SKris Buschelman           /* Second Column */
195924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
196024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
196124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
196224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
196324c233c2SKris Buschelman 
196424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
196524c233c2SKris Buschelman 
196624c233c2SKris Buschelman           /* Third Column */
196724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
196824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
196924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
197024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
197124c233c2SKris Buschelman 
197224c233c2SKris Buschelman           /* Fourth Column */
197324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
197424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
197524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
197624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
197724c233c2SKris Buschelman         SSE_INLINE_END_2
197824c233c2SKris Buschelman 
197924c233c2SKris Buschelman         v  += 16;
198024c233c2SKris Buschelman       }
198124c233c2SKris Buschelman       idx = 4*i;
198224c233c2SKris Buschelman       v   = aa + 16*ai[++i];
198324c233c2SKris Buschelman       PREFETCH_NTA(v);
198424c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
198524c233c2SKris Buschelman 
198624c233c2SKris Buschelman       /* Promote result from float to double */
198724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
198824c233c2SKris Buschelman     }
198924c233c2SKris Buschelman     /* backward solve the upper triangular */
199024c233c2SKris Buschelman     idt  = 4*(n-1);
199124c233c2SKris Buschelman     ai16 = 16*diag[n-1];
199224c233c2SKris Buschelman     v    = aa + ai16 + 16;
199324c233c2SKris Buschelman     for (i=n-1; i>=0;){
199424c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
199524c233c2SKris Buschelman       vi = aj + diag[i] + 1;
199624c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
199724c233c2SKris Buschelman 
199824c233c2SKris Buschelman       /* Demote accumulator from double to float */
199924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
200024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
200124c233c2SKris Buschelman 
200224c233c2SKris Buschelman       while (nz--) {
200324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
200424c233c2SKris Buschelman         idx = 4*(*vi++);
200524c233c2SKris Buschelman 
200624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
200724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
200824c233c2SKris Buschelman 
200924c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
201024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
201124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
201224c233c2SKris Buschelman 
201324c233c2SKris Buschelman           /* First Column */
201424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
201524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
201624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
201724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
201824c233c2SKris Buschelman 
201924c233c2SKris Buschelman           /* Second Column */
202024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
202124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
202224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
202324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
202424c233c2SKris Buschelman 
202524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
202624c233c2SKris Buschelman 
202724c233c2SKris Buschelman           /* Third Column */
202824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
202924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
203024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
203124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
203224c233c2SKris Buschelman 
203324c233c2SKris Buschelman           /* Fourth Column */
203424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
203524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
203624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
203724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
203824c233c2SKris Buschelman         SSE_INLINE_END_2
203924c233c2SKris Buschelman         v  += 16;
204024c233c2SKris Buschelman       }
204124c233c2SKris Buschelman       v    = aa + ai16;
204224c233c2SKris Buschelman       ai16 = 16*diag[--i];
204324c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
204424c233c2SKris Buschelman       /*
204524c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
204624c233c2SKris Buschelman          which was inverted as part of the factorization
204724c233c2SKris Buschelman       */
204824c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
204924c233c2SKris Buschelman         /* First Column */
205024c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
205124c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
205224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
205324c233c2SKris Buschelman 
205424c233c2SKris Buschelman         /* Second Column */
205524c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
205624c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
205724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
205824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
205924c233c2SKris Buschelman 
206024c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
206124c233c2SKris Buschelman 
206224c233c2SKris Buschelman         /* Third Column */
206324c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
206424c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
206524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
206624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
206724c233c2SKris Buschelman 
206824c233c2SKris Buschelman         /* Fourth Column */
206924c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
207024c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
207124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
207224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
207324c233c2SKris Buschelman 
207424c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
207524c233c2SKris Buschelman       SSE_INLINE_END_3
207624c233c2SKris Buschelman 
207724c233c2SKris Buschelman       /* Promote solution from float to double */
207824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
207924c233c2SKris Buschelman 
208024c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
208124c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
208224c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
208324c233c2SKris Buschelman       idc  = 4*(*c--);
208424c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
208524c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
208624c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
208724c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
208824c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
208924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
209024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
209124c233c2SKris Buschelman       SSE_INLINE_END_2
209224c233c2SKris Buschelman       v    = aa + ai16 + 16;
209324c233c2SKris Buschelman       idt -= 4;
209424c233c2SKris Buschelman     }
209524c233c2SKris Buschelman 
209624c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
209724c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
20981ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
20991ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2100d0f46423SBarry Smith     ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
210124c233c2SKris Buschelman   SSE_SCOPE_END;
210224c233c2SKris Buschelman   PetscFunctionReturn(0);
210324c233c2SKris Buschelman }
210424c233c2SKris Buschelman 
210524c233c2SKris Buschelman #endif
21060ef38995SBarry Smith 
21070ef38995SBarry Smith 
21084e2b4712SSatish Balay /*
21094e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
21104e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
21114e2b4712SSatish Balay */
21124a2ae208SSatish Balay #undef __FUNCT__
21134a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2114dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
21154e2b4712SSatish Balay {
21164e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2117356650c2SBarry Smith   PetscInt          n=a->mbs;
2118356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2119dfbe8321SBarry Smith   PetscErrorCode    ierr;
2120356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2121d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2122d9fead3dSBarry Smith   PetscScalar       *x;
2123d9fead3dSBarry Smith   const PetscScalar *b;
21244e2b4712SSatish Balay 
21254e2b4712SSatish Balay   PetscFunctionBegin;
2126d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21284e2b4712SSatish Balay 
2129aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
21302853dc0eSBarry Smith   {
213187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21322853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
21332853dc0eSBarry Smith   }
2134aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
21352853dc0eSBarry Smith   {
213687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
21372853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
21382853dc0eSBarry Smith   }
2139aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
21402853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2141e1293385SBarry Smith #else
214230d4dcafSBarry Smith   {
214387828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2144d9fead3dSBarry Smith     const MatScalar *v;
2145356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2146356650c2SBarry Smith     const PetscInt  *vi;
2147e1293385SBarry Smith 
21484e2b4712SSatish Balay   /* forward solve the lower triangular */
21494e2b4712SSatish Balay   idx    = 0;
2150e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
21514e2b4712SSatish Balay   for (i=1; i<n; i++) {
21524e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
21534e2b4712SSatish Balay     vi    =  aj      + ai[i];
21544e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2155e1293385SBarry Smith     idx   +=  4;
2156f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
21574e2b4712SSatish Balay     while (nz--) {
21584e2b4712SSatish Balay       jdx   = 4*(*vi++);
21594e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2160f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2161f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2162f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2163f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
21644e2b4712SSatish Balay       v    += 16;
21654e2b4712SSatish Balay     }
2166f1af5d2fSBarry Smith     x[idx]   = s1;
2167f1af5d2fSBarry Smith     x[1+idx] = s2;
2168f1af5d2fSBarry Smith     x[2+idx] = s3;
2169f1af5d2fSBarry Smith     x[3+idx] = s4;
21704e2b4712SSatish Balay   }
21714e2b4712SSatish Balay   /* backward solve the upper triangular */
21724e555682SBarry Smith   idt = 4*(n-1);
21734e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
21744e555682SBarry Smith     ai16 = 16*diag[i];
21754e555682SBarry Smith     v    = aa + ai16 + 16;
21764e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
21774e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2178f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2179f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
21804e2b4712SSatish Balay     while (nz--) {
21814e2b4712SSatish Balay       idx   = 4*(*vi++);
21824e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2183f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2184f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2185f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2186f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
21874e2b4712SSatish Balay       v    += 16;
21884e2b4712SSatish Balay     }
21894e555682SBarry Smith     v        = aa + ai16;
2190f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2191f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2192f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2193f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2194329f5518SBarry Smith     idt -= 4;
21954e2b4712SSatish Balay   }
219630d4dcafSBarry Smith   }
2197e1293385SBarry Smith #endif
21984e2b4712SSatish Balay 
2199d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2201d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
22024e2b4712SSatish Balay   PetscFunctionReturn(0);
22034e2b4712SSatish Balay }
22044e2b4712SSatish Balay 
2205f26ec98cSKris Buschelman #undef __FUNCT__
2206f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2207dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2208f26ec98cSKris Buschelman {
2209f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2210690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2211dfbe8321SBarry Smith   PetscErrorCode ierr;
2212690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2213f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2214f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2215f26ec98cSKris Buschelman 
2216f26ec98cSKris Buschelman   PetscFunctionBegin;
22171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2219f26ec98cSKris Buschelman 
2220f26ec98cSKris Buschelman   {
2221f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2222f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2223690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2224f26ec98cSKris Buschelman 
2225f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2226f26ec98cSKris Buschelman     idx  = 0;
2227f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2228f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2229f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2230f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2231f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2232f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2233f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2234f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2235f26ec98cSKris Buschelman       idx   +=  4;
2236f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2237f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2238f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2239f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2240f26ec98cSKris Buschelman       while (nz--) {
2241f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2242f26ec98cSKris Buschelman         x1  = t[jdx];
2243f26ec98cSKris Buschelman         x2  = t[1+jdx];
2244f26ec98cSKris Buschelman         x3  = t[2+jdx];
2245f26ec98cSKris Buschelman         x4  = t[3+jdx];
2246f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2247f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2248f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2249f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2250f26ec98cSKris Buschelman         v    += 16;
2251f26ec98cSKris Buschelman       }
2252f26ec98cSKris Buschelman       t[idx]   = s1;
2253f26ec98cSKris Buschelman       t[1+idx] = s2;
2254f26ec98cSKris Buschelman       t[2+idx] = s3;
2255f26ec98cSKris Buschelman       t[3+idx] = s4;
2256f26ec98cSKris Buschelman     }
2257f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2258f26ec98cSKris Buschelman     idt = 4*(n-1);
2259f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2260f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2261f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2262f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2263f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2264f26ec98cSKris Buschelman       s1   = t[idt];
2265f26ec98cSKris Buschelman       s2   = t[1+idt];
2266f26ec98cSKris Buschelman       s3   = t[2+idt];
2267f26ec98cSKris Buschelman       s4   = t[3+idt];
2268f26ec98cSKris Buschelman       while (nz--) {
2269f26ec98cSKris Buschelman         idx = 4*(*vi++);
2270f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2271f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2272f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2273f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2274f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2275f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2276f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2277f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2278f26ec98cSKris Buschelman         v    += 16;
2279f26ec98cSKris Buschelman       }
2280f26ec98cSKris Buschelman       v        = aa + ai16;
2281f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2282f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2283f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2284f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2285f26ec98cSKris Buschelman       idt -= 4;
2286f26ec98cSKris Buschelman     }
2287f26ec98cSKris Buschelman   }
2288f26ec98cSKris Buschelman 
22891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2291d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
2292f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2293f26ec98cSKris Buschelman }
2294f26ec98cSKris Buschelman 
22953660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
22963660e330SKris Buschelman 
22973660e330SKris Buschelman #include PETSC_HAVE_SSE
22983660e330SKris Buschelman #undef __FUNCT__
22997cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2300dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
23013660e330SKris Buschelman {
23023660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
23032aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2304dfbe8321SBarry Smith   PetscErrorCode ierr;
2305dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
23063660e330SKris Buschelman   MatScalar      *aa=a->a;
230787828ca2SBarry Smith   PetscScalar    *x,*b;
23083660e330SKris Buschelman 
23093660e330SKris Buschelman   PetscFunctionBegin;
23103660e330SKris Buschelman   SSE_SCOPE_BEGIN;
23113660e330SKris Buschelman   /*
23123660e330SKris Buschelman      Note: This code currently uses demotion of double
23133660e330SKris Buschelman      to float when performing the mixed-mode computation.
23143660e330SKris Buschelman      This may not be numerically reasonable for all applications.
23153660e330SKris Buschelman   */
23163660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
23173660e330SKris Buschelman 
23181ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23203660e330SKris Buschelman   {
2321eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2322eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
23232aa5897fSKris Buschelman     int            nz,i,idt,ai16;
23242aa5897fSKris Buschelman     unsigned int   jdx,idx;
23252aa5897fSKris Buschelman     unsigned short *vi;
2326eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
23273660e330SKris Buschelman 
2328eb05f457SKris Buschelman     /* First block is the identity. */
23293660e330SKris Buschelman     idx  = 0;
2330eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
23312aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
23323660e330SKris Buschelman 
23333660e330SKris Buschelman     for (i=1; i<n;) {
23343660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23353660e330SKris Buschelman       vi   =  aj      + ai[i];
23363660e330SKris Buschelman       nz   =  diag[i] - ai[i];
23373660e330SKris Buschelman       idx +=  4;
23383660e330SKris Buschelman 
2339eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2340eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2341eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
23423660e330SKris Buschelman 
23433660e330SKris Buschelman       while (nz--) {
23443660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23452aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
23463660e330SKris Buschelman 
23473660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2348eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
23493660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
23503660e330SKris Buschelman 
23513660e330SKris Buschelman           /* First Column */
23523660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
23533660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
23543660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
23553660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
23563660e330SKris Buschelman 
23573660e330SKris Buschelman           /* Second Column */
23583660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
23593660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
23603660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
23613660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
23623660e330SKris Buschelman 
23633660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
23643660e330SKris Buschelman 
23653660e330SKris Buschelman           /* Third Column */
23663660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
23673660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
23683660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
23693660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
23703660e330SKris Buschelman 
23713660e330SKris Buschelman           /* Fourth Column */
23723660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
23733660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
23743660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
23753660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
23763660e330SKris Buschelman         SSE_INLINE_END_2
23773660e330SKris Buschelman 
23783660e330SKris Buschelman         v  += 16;
23793660e330SKris Buschelman       }
23803660e330SKris Buschelman       v    =  aa + 16*ai[++i];
23813660e330SKris Buschelman       PREFETCH_NTA(v);
2382eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
23833660e330SKris Buschelman     }
2384eb05f457SKris Buschelman 
2385eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2386eb05f457SKris Buschelman 
23873660e330SKris Buschelman     idt  = 4*(n-1);
23883660e330SKris Buschelman     ai16 = 16*diag[n-1];
23893660e330SKris Buschelman     v    = aa + ai16 + 16;
23903660e330SKris Buschelman     for (i=n-1; i>=0;){
23913660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
23923660e330SKris Buschelman       vi = aj + diag[i] + 1;
23933660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
23943660e330SKris Buschelman 
2395eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
23963660e330SKris Buschelman 
23973660e330SKris Buschelman       while (nz--) {
23983660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
23992aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
24003660e330SKris Buschelman 
24013660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2402eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
24033660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24043660e330SKris Buschelman 
24053660e330SKris Buschelman           /* First Column */
24063660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24073660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24083660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24093660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24103660e330SKris Buschelman 
24113660e330SKris Buschelman           /* Second Column */
24123660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24133660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24143660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24163660e330SKris Buschelman 
24173660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24183660e330SKris Buschelman 
24193660e330SKris Buschelman           /* Third Column */
24203660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24213660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24223660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24233660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24243660e330SKris Buschelman 
24253660e330SKris Buschelman           /* Fourth Column */
24263660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24273660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24283660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
24303660e330SKris Buschelman         SSE_INLINE_END_2
24313660e330SKris Buschelman         v  += 16;
24323660e330SKris Buschelman       }
24333660e330SKris Buschelman       v    = aa + ai16;
24343660e330SKris Buschelman       ai16 = 16*diag[--i];
24353660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
24363660e330SKris Buschelman       /*
24373660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
24383660e330SKris Buschelman          which was inverted as part of the factorization
24393660e330SKris Buschelman       */
2440eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
24413660e330SKris Buschelman         /* First Column */
24423660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
24433660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
24443660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
24453660e330SKris Buschelman 
24463660e330SKris Buschelman         /* Second Column */
24473660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
24483660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
24493660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
24503660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
24513660e330SKris Buschelman 
24523660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
24533660e330SKris Buschelman 
24543660e330SKris Buschelman         /* Third Column */
24553660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
24563660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
24573660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
24583660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
24593660e330SKris Buschelman 
24603660e330SKris Buschelman         /* Fourth Column */
24613660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
24623660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
24633660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
24643660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
24653660e330SKris Buschelman 
24663660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
24673660e330SKris Buschelman       SSE_INLINE_END_3
24683660e330SKris Buschelman 
24693660e330SKris Buschelman       v    = aa + ai16 + 16;
24703660e330SKris Buschelman       idt -= 4;
24713660e330SKris Buschelman     }
2472eb05f457SKris Buschelman 
2473eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2474eb05f457SKris Buschelman     idt = 4*(n-1);
2475eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2476eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2477eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2478eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2479eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2480eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2481eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2482eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2483eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
248454693613SKris Buschelman       idt -= 4;
24853660e330SKris Buschelman     }
2486eb05f457SKris Buschelman 
2487eb05f457SKris Buschelman   } /* End of artificial scope. */
24881ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
24891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2490d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
24913660e330SKris Buschelman   SSE_SCOPE_END;
24923660e330SKris Buschelman   PetscFunctionReturn(0);
24933660e330SKris Buschelman }
24943660e330SKris Buschelman 
24957cf1b8d3SKris Buschelman #undef __FUNCT__
24967cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2497dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
24987cf1b8d3SKris Buschelman {
24997cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
25007cf1b8d3SKris Buschelman   int            *aj=a->j;
2501dfbe8321SBarry Smith   PetscErrorCode ierr;
2502dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
25037cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
25047cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
25057cf1b8d3SKris Buschelman 
25067cf1b8d3SKris Buschelman   PetscFunctionBegin;
25077cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
25087cf1b8d3SKris Buschelman   /*
25097cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
25107cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
25117cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
25127cf1b8d3SKris Buschelman   */
25137cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
25147cf1b8d3SKris Buschelman 
25151ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25177cf1b8d3SKris Buschelman   {
25187cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
25197cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
25207cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
25217cf1b8d3SKris Buschelman     int       jdx,idx;
25227cf1b8d3SKris Buschelman     int       *vi;
25237cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
25247cf1b8d3SKris Buschelman 
25257cf1b8d3SKris Buschelman     /* First block is the identity. */
25267cf1b8d3SKris Buschelman     idx  = 0;
25277cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
25287cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
25297cf1b8d3SKris Buschelman 
25307cf1b8d3SKris Buschelman     for (i=1; i<n;) {
25317cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25327cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
25337cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
25347cf1b8d3SKris Buschelman       idx +=  4;
25357cf1b8d3SKris Buschelman 
25367cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
25377cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
25387cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
25397cf1b8d3SKris Buschelman 
25407cf1b8d3SKris Buschelman       while (nz--) {
25417cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25427cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
25437cf1b8d3SKris Buschelman /*          jdx = *vi++; */
25447cf1b8d3SKris Buschelman 
25457cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
25467cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
25477cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25487cf1b8d3SKris Buschelman 
25497cf1b8d3SKris Buschelman           /* First Column */
25507cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25517cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25527cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25537cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25547cf1b8d3SKris Buschelman 
25557cf1b8d3SKris Buschelman           /* Second Column */
25567cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25577cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25587cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25597cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25607cf1b8d3SKris Buschelman 
25617cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25627cf1b8d3SKris Buschelman 
25637cf1b8d3SKris Buschelman           /* Third Column */
25647cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25657cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25667cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25677cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25687cf1b8d3SKris Buschelman 
25697cf1b8d3SKris Buschelman           /* Fourth Column */
25707cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25717cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25727cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25737cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25747cf1b8d3SKris Buschelman         SSE_INLINE_END_2
25757cf1b8d3SKris Buschelman 
25767cf1b8d3SKris Buschelman         v  += 16;
25777cf1b8d3SKris Buschelman       }
25787cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
25797cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
25807cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
25817cf1b8d3SKris Buschelman     }
25827cf1b8d3SKris Buschelman 
25837cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
25847cf1b8d3SKris Buschelman 
25857cf1b8d3SKris Buschelman     idt  = 4*(n-1);
25867cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
25877cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
25887cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
25897cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
25907cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
25917cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25927cf1b8d3SKris Buschelman 
25937cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25947cf1b8d3SKris Buschelman 
25957cf1b8d3SKris Buschelman       while (nz--) {
25967cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
25977cf1b8d3SKris Buschelman         idx = 4*(*vi++);
25987cf1b8d3SKris Buschelman /*          idx = *vi++; */
25997cf1b8d3SKris Buschelman 
26007cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
26017cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
26027cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26037cf1b8d3SKris Buschelman 
26047cf1b8d3SKris Buschelman           /* First Column */
26057cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26067cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26077cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26087cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26097cf1b8d3SKris Buschelman 
26107cf1b8d3SKris Buschelman           /* Second Column */
26117cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26127cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26137cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26147cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26157cf1b8d3SKris Buschelman 
26167cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26177cf1b8d3SKris Buschelman 
26187cf1b8d3SKris Buschelman           /* Third Column */
26197cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26207cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26217cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26227cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26237cf1b8d3SKris Buschelman 
26247cf1b8d3SKris Buschelman           /* Fourth Column */
26257cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26267cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26277cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26287cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26297cf1b8d3SKris Buschelman         SSE_INLINE_END_2
26307cf1b8d3SKris Buschelman         v  += 16;
26317cf1b8d3SKris Buschelman       }
26327cf1b8d3SKris Buschelman       v    = aa + ai16;
26337cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
26347cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
26357cf1b8d3SKris Buschelman       /*
26367cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
26377cf1b8d3SKris Buschelman          which was inverted as part of the factorization
26387cf1b8d3SKris Buschelman       */
26397cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
26407cf1b8d3SKris Buschelman         /* First Column */
26417cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
26427cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
26437cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
26447cf1b8d3SKris Buschelman 
26457cf1b8d3SKris Buschelman         /* Second Column */
26467cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
26477cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
26487cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
26497cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
26507cf1b8d3SKris Buschelman 
26517cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
26527cf1b8d3SKris Buschelman 
26537cf1b8d3SKris Buschelman         /* Third Column */
26547cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
26557cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
26567cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
26577cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
26587cf1b8d3SKris Buschelman 
26597cf1b8d3SKris Buschelman         /* Fourth Column */
26607cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
26617cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
26627cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
26637cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
26647cf1b8d3SKris Buschelman 
26657cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
26667cf1b8d3SKris Buschelman       SSE_INLINE_END_3
26677cf1b8d3SKris Buschelman 
26687cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
26697cf1b8d3SKris Buschelman       idt -= 4;
26707cf1b8d3SKris Buschelman     }
26717cf1b8d3SKris Buschelman 
26727cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
26737cf1b8d3SKris Buschelman     idt = 4*(n-1);
26747cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
26757cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
26767cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
26777cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
26787cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
26797cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
26807cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
26817cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
26827cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
26837cf1b8d3SKris Buschelman       idt -= 4;
26847cf1b8d3SKris Buschelman     }
26857cf1b8d3SKris Buschelman 
26867cf1b8d3SKris Buschelman   } /* End of artificial scope. */
26871ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26881ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2689d0f46423SBarry Smith   ierr = PetscLogFlops(2*16*(a->nz) - 4*A->cmap->n);CHKERRQ(ierr);
26907cf1b8d3SKris Buschelman   SSE_SCOPE_END;
26917cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
26927cf1b8d3SKris Buschelman }
26937cf1b8d3SKris Buschelman 
26943660e330SKris Buschelman #endif
26954a2ae208SSatish Balay #undef __FUNCT__
26964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2697dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
26984e2b4712SSatish Balay {
26994e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
27004e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
27016849ba73SBarry Smith   PetscErrorCode    ierr;
2702*5d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2703*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2704d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2705d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2706d9fead3dSBarry Smith   const PetscScalar *b;
27074e2b4712SSatish Balay 
27084e2b4712SSatish Balay   PetscFunctionBegin;
2709d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2711f1af5d2fSBarry Smith   t  = a->solve_work;
27124e2b4712SSatish Balay 
27134e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27144e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27154e2b4712SSatish Balay 
27164e2b4712SSatish Balay   /* forward solve the lower triangular */
27174e2b4712SSatish Balay   idx    = 3*(*r++);
2718f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
27194e2b4712SSatish Balay   for (i=1; i<n; i++) {
27204e2b4712SSatish Balay     v     = aa + 9*ai[i];
27214e2b4712SSatish Balay     vi    = aj + ai[i];
27224e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27234e2b4712SSatish Balay     idx   = 3*(*r++);
2724f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
27254e2b4712SSatish Balay     while (nz--) {
27264e2b4712SSatish Balay       idx   = 3*(*vi++);
2727f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2728f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2729f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2730f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27314e2b4712SSatish Balay       v += 9;
27324e2b4712SSatish Balay     }
27334e2b4712SSatish Balay     idx = 3*i;
2734f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
27354e2b4712SSatish Balay   }
27364e2b4712SSatish Balay   /* backward solve the upper triangular */
27374e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
27384e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
27394e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
27404e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
27414e2b4712SSatish Balay     idt  = 3*i;
2742f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
27434e2b4712SSatish Balay     while (nz--) {
27444e2b4712SSatish Balay       idx   = 3*(*vi++);
2745f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2746f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2747f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2748f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
27494e2b4712SSatish Balay       v += 9;
27504e2b4712SSatish Balay     }
27514e2b4712SSatish Balay     idc = 3*(*c--);
27524e2b4712SSatish Balay     v   = aa + 9*diag[i];
2753f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2754f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2755f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
27564e2b4712SSatish Balay   }
27574e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
27584e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2759d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2761d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
27624e2b4712SSatish Balay   PetscFunctionReturn(0);
27634e2b4712SSatish Balay }
27644e2b4712SSatish Balay 
276515091d37SBarry Smith /*
276615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
276715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
276815091d37SBarry Smith */
27694a2ae208SSatish Balay #undef __FUNCT__
27704a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2771dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
277215091d37SBarry Smith {
277315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2774690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2775dfbe8321SBarry Smith   PetscErrorCode    ierr;
2776690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2777d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2778d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2779d9fead3dSBarry Smith   const PetscScalar *b;
2780690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
278115091d37SBarry Smith 
278215091d37SBarry Smith   PetscFunctionBegin;
2783d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27841ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
278515091d37SBarry Smith 
278615091d37SBarry Smith   /* forward solve the lower triangular */
278715091d37SBarry Smith   idx    = 0;
278815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
278915091d37SBarry Smith   for (i=1; i<n; i++) {
279015091d37SBarry Smith     v     =  aa      + 9*ai[i];
279115091d37SBarry Smith     vi    =  aj      + ai[i];
279215091d37SBarry Smith     nz    =  diag[i] - ai[i];
279315091d37SBarry Smith     idx   +=  3;
2794f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
279515091d37SBarry Smith     while (nz--) {
279615091d37SBarry Smith       jdx   = 3*(*vi++);
279715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2798f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
280115091d37SBarry Smith       v    += 9;
280215091d37SBarry Smith     }
2803f1af5d2fSBarry Smith     x[idx]   = s1;
2804f1af5d2fSBarry Smith     x[1+idx] = s2;
2805f1af5d2fSBarry Smith     x[2+idx] = s3;
280615091d37SBarry Smith   }
280715091d37SBarry Smith   /* backward solve the upper triangular */
280815091d37SBarry Smith   for (i=n-1; i>=0; i--){
280915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
281015091d37SBarry Smith     vi   = aj + diag[i] + 1;
281115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
281215091d37SBarry Smith     idt  = 3*i;
2813f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2814f1af5d2fSBarry Smith     s3 = x[2+idt];
281515091d37SBarry Smith     while (nz--) {
281615091d37SBarry Smith       idx   = 3*(*vi++);
281715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2818f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2819f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2820f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
282115091d37SBarry Smith       v    += 9;
282215091d37SBarry Smith     }
282315091d37SBarry Smith     v        = aa +  9*diag[i];
2824f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
282715091d37SBarry Smith   }
282815091d37SBarry Smith 
2829d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2831d0f46423SBarry Smith   ierr = PetscLogFlops(2*9*(a->nz) - 3*A->cmap->n);CHKERRQ(ierr);
283215091d37SBarry Smith   PetscFunctionReturn(0);
283315091d37SBarry Smith }
283415091d37SBarry Smith 
28354a2ae208SSatish Balay #undef __FUNCT__
28364a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2837dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
28384e2b4712SSatish Balay {
28394e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
28404e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
28416849ba73SBarry Smith   PetscErrorCode    ierr;
2842*5d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2843*5d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2844d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2845d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
2846d9fead3dSBarry Smith   const PetscScalar *b;
28474e2b4712SSatish Balay 
28484e2b4712SSatish Balay   PetscFunctionBegin;
2849d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28501ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2851f1af5d2fSBarry Smith   t  = a->solve_work;
28524e2b4712SSatish Balay 
28534e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28544e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28554e2b4712SSatish Balay 
28564e2b4712SSatish Balay   /* forward solve the lower triangular */
28574e2b4712SSatish Balay   idx    = 2*(*r++);
2858f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
28594e2b4712SSatish Balay   for (i=1; i<n; i++) {
28604e2b4712SSatish Balay     v     = aa + 4*ai[i];
28614e2b4712SSatish Balay     vi    = aj + ai[i];
28624e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28634e2b4712SSatish Balay     idx   = 2*(*r++);
2864f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
28654e2b4712SSatish Balay     while (nz--) {
28664e2b4712SSatish Balay       idx   = 2*(*vi++);
2867f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2868f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2869f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28704e2b4712SSatish Balay       v += 4;
28714e2b4712SSatish Balay     }
28724e2b4712SSatish Balay     idx = 2*i;
2873f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
28744e2b4712SSatish Balay   }
28754e2b4712SSatish Balay   /* backward solve the upper triangular */
28764e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28774e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
28784e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28794e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28804e2b4712SSatish Balay     idt  = 2*i;
2881f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
28824e2b4712SSatish Balay     while (nz--) {
28834e2b4712SSatish Balay       idx   = 2*(*vi++);
2884f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2885f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2886f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
28874e2b4712SSatish Balay       v += 4;
28884e2b4712SSatish Balay     }
28894e2b4712SSatish Balay     idc = 2*(*c--);
28904e2b4712SSatish Balay     v   = aa + 4*diag[i];
2891f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2892f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
28934e2b4712SSatish Balay   }
28944e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28954e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2896d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2898d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
28994e2b4712SSatish Balay   PetscFunctionReturn(0);
29004e2b4712SSatish Balay }
29014e2b4712SSatish Balay 
290215091d37SBarry Smith /*
290315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
290415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
290515091d37SBarry Smith */
29064a2ae208SSatish Balay #undef __FUNCT__
29074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2908dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
290915091d37SBarry Smith {
291015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2911690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2912dfbe8321SBarry Smith   PetscErrorCode    ierr;
2913690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2914d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2915d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
2916d9fead3dSBarry Smith   const PetscScalar *b;
2917690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
291815091d37SBarry Smith 
291915091d37SBarry Smith   PetscFunctionBegin;
2920d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292215091d37SBarry Smith 
292315091d37SBarry Smith   /* forward solve the lower triangular */
292415091d37SBarry Smith   idx    = 0;
292515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
292615091d37SBarry Smith   for (i=1; i<n; i++) {
292715091d37SBarry Smith     v     =  aa      + 4*ai[i];
292815091d37SBarry Smith     vi    =  aj      + ai[i];
292915091d37SBarry Smith     nz    =  diag[i] - ai[i];
293015091d37SBarry Smith     idx   +=  2;
2931f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
293215091d37SBarry Smith     while (nz--) {
293315091d37SBarry Smith       jdx   = 2*(*vi++);
293415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
2935f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2936f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
293715091d37SBarry Smith       v    += 4;
293815091d37SBarry Smith     }
2939f1af5d2fSBarry Smith     x[idx]   = s1;
2940f1af5d2fSBarry Smith     x[1+idx] = s2;
294115091d37SBarry Smith   }
294215091d37SBarry Smith   /* backward solve the upper triangular */
294315091d37SBarry Smith   for (i=n-1; i>=0; i--){
294415091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
294515091d37SBarry Smith     vi   = aj + diag[i] + 1;
294615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
294715091d37SBarry Smith     idt  = 2*i;
2948f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
294915091d37SBarry Smith     while (nz--) {
295015091d37SBarry Smith       idx   = 2*(*vi++);
295115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
2952f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2953f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
295415091d37SBarry Smith       v    += 4;
295515091d37SBarry Smith     }
295615091d37SBarry Smith     v        = aa +  4*diag[i];
2957f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
2958f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
295915091d37SBarry Smith   }
296015091d37SBarry Smith 
2961d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29621ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2963d0f46423SBarry Smith   ierr = PetscLogFlops(2*4*(a->nz) - 2*A->cmap->n);CHKERRQ(ierr);
296415091d37SBarry Smith   PetscFunctionReturn(0);
296515091d37SBarry Smith }
296615091d37SBarry Smith 
29674a2ae208SSatish Balay #undef __FUNCT__
29684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
2969dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
29704e2b4712SSatish Balay {
29714e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
29724e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
29736849ba73SBarry Smith   PetscErrorCode ierr;
2974*5d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
2975*5d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
29763f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
297787828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
29784e2b4712SSatish Balay 
29794e2b4712SSatish Balay   PetscFunctionBegin;
29804e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
29814e2b4712SSatish Balay 
29821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
29831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2984f1af5d2fSBarry Smith   t  = a->solve_work;
29854e2b4712SSatish Balay 
29864e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29874e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29884e2b4712SSatish Balay 
29894e2b4712SSatish Balay   /* forward solve the lower triangular */
2990f1af5d2fSBarry Smith   t[0] = b[*r++];
29914e2b4712SSatish Balay   for (i=1; i<n; i++) {
29924e2b4712SSatish Balay     v     = aa + ai[i];
29934e2b4712SSatish Balay     vi    = aj + ai[i];
29944e2b4712SSatish Balay     nz    = diag[i] - ai[i];
2995f1af5d2fSBarry Smith     s1  = b[*r++];
29964e2b4712SSatish Balay     while (nz--) {
2997f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
29984e2b4712SSatish Balay     }
2999f1af5d2fSBarry Smith     t[i] = s1;
30004e2b4712SSatish Balay   }
30014e2b4712SSatish Balay   /* backward solve the upper triangular */
30024e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30034e2b4712SSatish Balay     v    = aa + diag[i] + 1;
30044e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30054e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3006f1af5d2fSBarry Smith     s1 = t[i];
30074e2b4712SSatish Balay     while (nz--) {
3008f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30094e2b4712SSatish Balay     }
3010f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
30114e2b4712SSatish Balay   }
30124e2b4712SSatish Balay 
30134e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30144e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30151ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3017d0f46423SBarry Smith   ierr = PetscLogFlops(2*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
30184e2b4712SSatish Balay   PetscFunctionReturn(0);
30194e2b4712SSatish Balay }
302015091d37SBarry Smith /*
302115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
302215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
302315091d37SBarry Smith */
30244a2ae208SSatish Balay #undef __FUNCT__
30254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3026dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
302715091d37SBarry Smith {
302815091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3029690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3030dfbe8321SBarry Smith   PetscErrorCode ierr;
3031690b6cddSBarry Smith   PetscInt       *diag = a->diag;
303215091d37SBarry Smith   MatScalar      *aa=a->a;
303387828ca2SBarry Smith   PetscScalar    *x,*b;
303487828ca2SBarry Smith   PetscScalar    s1,x1;
303515091d37SBarry Smith   MatScalar      *v;
3036690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
303715091d37SBarry Smith 
303815091d37SBarry Smith   PetscFunctionBegin;
30391ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
304115091d37SBarry Smith 
304215091d37SBarry Smith   /* forward solve the lower triangular */
304315091d37SBarry Smith   idx    = 0;
304415091d37SBarry Smith   x[0]   = b[0];
304515091d37SBarry Smith   for (i=1; i<n; i++) {
304615091d37SBarry Smith     v     =  aa      + ai[i];
304715091d37SBarry Smith     vi    =  aj      + ai[i];
304815091d37SBarry Smith     nz    =  diag[i] - ai[i];
304915091d37SBarry Smith     idx   +=  1;
3050f1af5d2fSBarry Smith     s1  =  b[idx];
305115091d37SBarry Smith     while (nz--) {
305215091d37SBarry Smith       jdx   = *vi++;
305315091d37SBarry Smith       x1    = x[jdx];
3054f1af5d2fSBarry Smith       s1 -= v[0]*x1;
305515091d37SBarry Smith       v    += 1;
305615091d37SBarry Smith     }
3057f1af5d2fSBarry Smith     x[idx]   = s1;
305815091d37SBarry Smith   }
305915091d37SBarry Smith   /* backward solve the upper triangular */
306015091d37SBarry Smith   for (i=n-1; i>=0; i--){
306115091d37SBarry Smith     v    = aa + diag[i] + 1;
306215091d37SBarry Smith     vi   = aj + diag[i] + 1;
306315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
306415091d37SBarry Smith     idt  = i;
3065f1af5d2fSBarry Smith     s1 = x[idt];
306615091d37SBarry Smith     while (nz--) {
306715091d37SBarry Smith       idx   = *vi++;
306815091d37SBarry Smith       x1    = x[idx];
3069f1af5d2fSBarry Smith       s1 -= v[0]*x1;
307015091d37SBarry Smith       v    += 1;
307115091d37SBarry Smith     }
307215091d37SBarry Smith     v        = aa +  diag[i];
3073f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
307415091d37SBarry Smith   }
30751ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30761ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3077d0f46423SBarry Smith   ierr = PetscLogFlops(2*(a->nz) - A->cmap->n);CHKERRQ(ierr);
307815091d37SBarry Smith   PetscFunctionReturn(0);
307915091d37SBarry Smith }
30804e2b4712SSatish Balay 
30814e2b4712SSatish Balay /* ----------------------------------------------------------------*/
30824e2b4712SSatish Balay /*
30834e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
30844e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
30854e2b4712SSatish Balay    Not a good example of code reuse.
30864e2b4712SSatish Balay */
3087719d5645SBarry Smith EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
3088db4efbfdSBarry Smith EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
3089435faa5fSBarry Smith 
30904a2ae208SSatish Balay #undef __FUNCT__
30914a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
3092719d5645SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,MatFactorInfo *info)
30934e2b4712SSatish Balay {
30944e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
30954e2b4712SSatish Balay   IS             isicol;
30966849ba73SBarry Smith   PetscErrorCode ierr;
3097*5d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
3098*5d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3099a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3100d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
310141df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
3102329f5518SBarry Smith   PetscReal      f;
31034e2b4712SSatish Balay 
31044e2b4712SSatish Balay   PetscFunctionBegin;
3105435faa5fSBarry Smith   f             = info->fill;
3106690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3107690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
31084c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3109667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3110667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
311141df41f0SMatthew Knepley   both_identity = row_identity && col_identity;
3112309c388cSBarry Smith 
311341df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
3114719d5645SBarry Smith     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
3115719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3116719d5645SBarry Smith     b               = (Mat_SeqBAIJ*)(fact)->data;
3117719d5645SBarry Smith     ierr = MatMissingDiagonal_SeqBAIJ(fact,&flg,&dd);CHKERRQ(ierr);
31182af78befSBarry Smith     if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry in row %D",dd);
3119bb3d539aSBarry Smith     b->row        = isrow;
3120bb3d539aSBarry Smith     b->col        = iscol;
3121bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3122bb3d539aSBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3123bb3d539aSBarry Smith     b->icol       = isicol;
3124bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3125719d5645SBarry Smith     ierr          = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
3126309c388cSBarry Smith   } else { /* general case perform the symbolic factorization */
31274e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
31284e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
31294e2b4712SSatish Balay 
31304e2b4712SSatish Balay     /* get new row pointers */
3131690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
31324e2b4712SSatish Balay     ainew[0] = 0;
31334e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3134690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3135690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
31364e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3137690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
31384e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3139690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
31404e2b4712SSatish Balay     /* im is level for each filled value */
3141690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
31424e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3143690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
31444e2b4712SSatish Balay     dloc[0]  = 0;
31454e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3146435faa5fSBarry Smith 
3147435faa5fSBarry Smith       /* copy prow into linked list */
31484e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
314929bbc08cSBarry Smith       if (!nz) SETERRQ(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix");
31504e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
31514e2b4712SSatish Balay       fill[n]    = n;
3152435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
31534e2b4712SSatish Balay       while (nz--) {
31544e2b4712SSatish Balay 	fm  = n;
31554e2b4712SSatish Balay 	idx = ic[*xi++];
31564e2b4712SSatish Balay 	do {
31574e2b4712SSatish Balay 	  m  = fm;
31584e2b4712SSatish Balay 	  fm = fill[m];
31594e2b4712SSatish Balay 	} while (fm < idx);
31604e2b4712SSatish Balay 	fill[m]   = idx;
31614e2b4712SSatish Balay 	fill[idx] = fm;
31624e2b4712SSatish Balay 	im[idx]   = 0;
31634e2b4712SSatish Balay       }
3164435faa5fSBarry Smith 
3165435faa5fSBarry Smith       /* make sure diagonal entry is included */
3166435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3167435faa5fSBarry Smith 	fm = n;
3168435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3169435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3170435faa5fSBarry Smith 	fill[fm]   = prow;
3171435faa5fSBarry Smith 	im[prow]   = 0;
3172435faa5fSBarry Smith 	nzf++;
3173335d9088SBarry Smith 	dcount++;
3174435faa5fSBarry Smith       }
3175435faa5fSBarry Smith 
31764e2b4712SSatish Balay       nzi = 0;
31774e2b4712SSatish Balay       row = fill[n];
31784e2b4712SSatish Balay       while (row < prow) {
31794e2b4712SSatish Balay 	incrlev = im[row] + 1;
31804e2b4712SSatish Balay 	nz      = dloc[row];
3181435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
31824e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
31834e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
31844e2b4712SSatish Balay 	fm      = row;
31854e2b4712SSatish Balay 	while (nnz-- > 0) {
31864e2b4712SSatish Balay 	  idx = *xi++;
31874e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
31884e2b4712SSatish Balay 	    flev++;
31894e2b4712SSatish Balay 	    continue;
31904e2b4712SSatish Balay 	  }
31914e2b4712SSatish Balay 	  do {
31924e2b4712SSatish Balay 	    m  = fm;
31934e2b4712SSatish Balay 	    fm = fill[m];
31944e2b4712SSatish Balay 	  } while (fm < idx);
31954e2b4712SSatish Balay 	  if (fm != idx) {
31964e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
31974e2b4712SSatish Balay 	    fill[m]   = idx;
31984e2b4712SSatish Balay 	    fill[idx] = fm;
31994e2b4712SSatish Balay 	    fm        = idx;
32004e2b4712SSatish Balay 	    nzf++;
3201ecf371e4SBarry Smith 	  } else {
32024e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
32034e2b4712SSatish Balay 	  }
32044e2b4712SSatish Balay 	  flev++;
32054e2b4712SSatish Balay 	}
32064e2b4712SSatish Balay 	row = fill[row];
32074e2b4712SSatish Balay 	nzi++;
32084e2b4712SSatish Balay       }
32094e2b4712SSatish Balay       /* copy new filled row into permanent storage */
32104e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
32114e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3212ecf371e4SBarry Smith 
3213ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3214ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3215ecf371e4SBarry Smith 	/* just double the memory each time */
3216690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3217ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
32184e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
32194e2b4712SSatish Balay 	jmax += maxadd;
3220ecf371e4SBarry Smith 
3221ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
3222*5d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3223*5d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3224606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
3225*5d0c19d7SBarry Smith 	ajnew = xitmp;
3226*5d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
3227*5d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3228606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
3229*5d0c19d7SBarry Smith 	ajfill = xitmp;
3230eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
32314e2b4712SSatish Balay       }
3232*5d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
32334e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
32344e2b4712SSatish Balay       dloc[prow]  = nzi;
32354e2b4712SSatish Balay       fm          = fill[n];
32364e2b4712SSatish Balay       while (nzf--) {
3237*5d0c19d7SBarry Smith 	*xitmp++ = fm;
32384e2b4712SSatish Balay 	*flev++ = im[fm];
32394e2b4712SSatish Balay 	fm      = fill[fm];
32404e2b4712SSatish Balay       }
3241435faa5fSBarry Smith       /* make sure row has diagonal entry */
3242435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
324377431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
32442401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3245435faa5fSBarry Smith       }
32464e2b4712SSatish Balay     }
3247606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
32484e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
32494e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3250606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3251606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
32524e2b4712SSatish Balay 
32536cf91177SBarry Smith #if defined(PETSC_USE_INFO)
32544e2b4712SSatish Balay     {
3255329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3256ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3257ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3258ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3259ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3260335d9088SBarry Smith       if (diagonal_fill) {
3261ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3262335d9088SBarry Smith       }
32634e2b4712SSatish Balay     }
326463ba0a88SBarry Smith #endif
32654e2b4712SSatish Balay 
32664e2b4712SSatish Balay     /* put together the new matrix */
3267719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3268719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3269719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3270e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3271e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
32727c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3273a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
32744e2b4712SSatish Balay     b->j          = ajnew;
32754e2b4712SSatish Balay     b->i          = ainew;
32764e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
32774e2b4712SSatish Balay     b->diag       = dloc;
32784e2b4712SSatish Balay     b->ilen       = 0;
32794e2b4712SSatish Balay     b->imax       = 0;
32804e2b4712SSatish Balay     b->row        = isrow;
32814e2b4712SSatish Balay     b->col        = iscol;
3282bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3283c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3284c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3285e51c0b9cSSatish Balay     b->icol       = isicol;
328687828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
32874e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
32884e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3289719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
32904e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
32914e2b4712SSatish Balay 
3292719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3293719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3294719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
3295309c388cSBarry Smith   }
329641df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
32978661488fSKris Buschelman   PetscFunctionReturn(0);
32988661488fSKris Buschelman }
32998661488fSKris Buschelman 
3300732ee342SKris Buschelman #undef __FUNCT__
33017e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3302dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
33037e7071cdSKris Buschelman {
330412272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
330512272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
33065a9542e3SKris Buschelman   PetscFunctionBegin;
33077cf1b8d3SKris Buschelman   /* Undo Column scaling */
33087cf1b8d3SKris Buschelman /*    while (nz--) { */
33097cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
33107cf1b8d3SKris Buschelman /*    } */
3311c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3312c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
33137cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
33147cf1b8d3SKris Buschelman }
33157cf1b8d3SKris Buschelman 
33167cf1b8d3SKris Buschelman #undef __FUNCT__
33177cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3318dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
33197cf1b8d3SKris Buschelman {
33207cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3321b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
33222aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
33235a9542e3SKris Buschelman   PetscFunctionBegin;
33240b9da03eSKris Buschelman   /* Is this really necessary? */
332520235379SKris Buschelman   while (nz--) {
33260b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
33277e7071cdSKris Buschelman   }
3328c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
33297e7071cdSKris Buschelman   PetscFunctionReturn(0);
33307e7071cdSKris Buschelman }
33317e7071cdSKris Buschelman 
3332732ee342SKris Buschelman 
3333