xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 84a281e5cf08a828aba00cbac6399c8de1877e74)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
/*
    Factorization code for BAIJ format.
*/
64e2b4712SSatish Balay 
77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
87c4f633dSBarry Smith #include "../src/inline/ilu.h"
97c4f633dSBarry Smith #include "../src/inline/dot.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18690b6cddSBarry Smith   PetscInt       *diag = a->diag;
19f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2087828ca2SBarry Smith   PetscScalar    s1,*x,*b;
21f1af5d2fSBarry Smith 
22f1af5d2fSBarry Smith   PetscFunctionBegin;
23ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26f1af5d2fSBarry Smith 
27f1af5d2fSBarry Smith   /* forward solve the U^T */
28f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
29f1af5d2fSBarry Smith 
30f1af5d2fSBarry Smith     v     = aa + diag[i];
31f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
32ef66eb69SBarry Smith     s1    = (*v++)*x[i];
33f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
34f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
35f1af5d2fSBarry Smith     while (nz--) {
36f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
37f1af5d2fSBarry Smith     }
38f1af5d2fSBarry Smith     x[i]   = s1;
39f1af5d2fSBarry Smith   }
40f1af5d2fSBarry Smith   /* backward solve the L^T */
41f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
42f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
43f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
44f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
45f1af5d2fSBarry Smith     s1   = x[i];
46f1af5d2fSBarry Smith     while (nz--) {
47f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
48f1af5d2fSBarry Smith     }
49f1af5d2fSBarry Smith   }
501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53f1af5d2fSBarry Smith   PetscFunctionReturn(0);
54f1af5d2fSBarry Smith }
55f1af5d2fSBarry Smith 
564a2ae208SSatish Balay #undef __FUNCT__
574a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
58dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
59f1af5d2fSBarry Smith {
60f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
61dfbe8321SBarry Smith   PetscErrorCode ierr;
62690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
63690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
64f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6587828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6687828ca2SBarry Smith   PetscScalar    *x,*b;
67f1af5d2fSBarry Smith 
68f1af5d2fSBarry Smith   PetscFunctionBegin;
69ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
701ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
72f1af5d2fSBarry Smith 
73f1af5d2fSBarry Smith   /* forward solve the U^T */
74f1af5d2fSBarry Smith   idx = 0;
75f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
76f1af5d2fSBarry Smith 
77f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
78f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
79ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
80f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
81f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
82f1af5d2fSBarry Smith     v += 4;
83f1af5d2fSBarry Smith 
84f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
85f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
86f1af5d2fSBarry Smith     while (nz--) {
87f1af5d2fSBarry Smith       oidx = 2*(*vi++);
88f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
89f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
90f1af5d2fSBarry Smith       v  += 4;
91f1af5d2fSBarry Smith     }
92f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
93f1af5d2fSBarry Smith     idx += 2;
94f1af5d2fSBarry Smith   }
95f1af5d2fSBarry Smith   /* backward solve the L^T */
96f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
97f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
98f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
99f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
100f1af5d2fSBarry Smith     idt  = 2*i;
101f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
102f1af5d2fSBarry Smith     while (nz--) {
103f1af5d2fSBarry Smith       idx   = 2*(*vi--);
104f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
105f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
106f1af5d2fSBarry Smith       v -= 4;
107f1af5d2fSBarry Smith     }
108f1af5d2fSBarry Smith   }
1091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
111dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
112f1af5d2fSBarry Smith   PetscFunctionReturn(0);
113f1af5d2fSBarry Smith }
114f1af5d2fSBarry Smith 
1154a2ae208SSatish Balay #undef __FUNCT__
1164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
117dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
118f1af5d2fSBarry Smith {
119f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
120dfbe8321SBarry Smith   PetscErrorCode ierr;
121690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
122690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
123f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12487828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12587828ca2SBarry Smith   PetscScalar    *x,*b;
126f1af5d2fSBarry Smith 
127f1af5d2fSBarry Smith   PetscFunctionBegin;
128ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
131f1af5d2fSBarry Smith 
132f1af5d2fSBarry Smith   /* forward solve the U^T */
133f1af5d2fSBarry Smith   idx = 0;
134f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
135f1af5d2fSBarry Smith 
136f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
137f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
138ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
139f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
140f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
141f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
142f1af5d2fSBarry Smith     v += 9;
143f1af5d2fSBarry Smith 
144f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
145f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
146f1af5d2fSBarry Smith     while (nz--) {
147f1af5d2fSBarry Smith       oidx = 3*(*vi++);
148f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
149f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
150f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
151f1af5d2fSBarry Smith       v  += 9;
152f1af5d2fSBarry Smith     }
153f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
154f1af5d2fSBarry Smith     idx += 3;
155f1af5d2fSBarry Smith   }
156f1af5d2fSBarry Smith   /* backward solve the L^T */
157f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
158f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
159f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
160f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
161f1af5d2fSBarry Smith     idt  = 3*i;
162f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
163f1af5d2fSBarry Smith     while (nz--) {
164f1af5d2fSBarry Smith       idx   = 3*(*vi--);
165f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
166f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
167f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
168f1af5d2fSBarry Smith       v -= 9;
169f1af5d2fSBarry Smith     }
170f1af5d2fSBarry Smith   }
1711ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
173dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
174f1af5d2fSBarry Smith   PetscFunctionReturn(0);
175f1af5d2fSBarry Smith }
176f1af5d2fSBarry Smith 
1774a2ae208SSatish Balay #undef __FUNCT__
1784a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
179dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
180f1af5d2fSBarry Smith {
181f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182dfbe8321SBarry Smith   PetscErrorCode ierr;
183690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
184690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
185f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18787828ca2SBarry Smith   PetscScalar    *x,*b;
188f1af5d2fSBarry Smith 
189f1af5d2fSBarry Smith   PetscFunctionBegin;
190ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith   /* forward solve the U^T */
195f1af5d2fSBarry Smith   idx = 0;
196f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
197f1af5d2fSBarry Smith 
198f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
199f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
200ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
201f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
202f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
203f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
204f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
205f1af5d2fSBarry Smith     v += 16;
206f1af5d2fSBarry Smith 
207f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
208f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
209f1af5d2fSBarry Smith     while (nz--) {
210f1af5d2fSBarry Smith       oidx = 4*(*vi++);
211f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
212f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
213f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
214f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
215f1af5d2fSBarry Smith       v  += 16;
216f1af5d2fSBarry Smith     }
217f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
218f1af5d2fSBarry Smith     idx += 4;
219f1af5d2fSBarry Smith   }
220f1af5d2fSBarry Smith   /* backward solve the L^T */
221f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
222f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
223f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
224f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
225f1af5d2fSBarry Smith     idt  = 4*i;
226f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
227f1af5d2fSBarry Smith     while (nz--) {
228f1af5d2fSBarry Smith       idx   = 4*(*vi--);
229f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
230f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
231f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
232f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
233f1af5d2fSBarry Smith       v -= 16;
234f1af5d2fSBarry Smith     }
235f1af5d2fSBarry Smith   }
2361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
238dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
239f1af5d2fSBarry Smith   PetscFunctionReturn(0);
240f1af5d2fSBarry Smith }
241f1af5d2fSBarry Smith 
2424a2ae208SSatish Balay #undef __FUNCT__
2434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
244dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
245f1af5d2fSBarry Smith {
246f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
247dfbe8321SBarry Smith   PetscErrorCode ierr;
248690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
249690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
250f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25287828ca2SBarry Smith   PetscScalar    *x,*b;
253f1af5d2fSBarry Smith 
254f1af5d2fSBarry Smith   PetscFunctionBegin;
255ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
258f1af5d2fSBarry Smith 
259f1af5d2fSBarry Smith   /* forward solve the U^T */
260f1af5d2fSBarry Smith   idx = 0;
261f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
262f1af5d2fSBarry Smith 
263f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
264f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
265ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
266f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
267f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
268f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
269f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
270f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
271f1af5d2fSBarry Smith     v += 25;
272f1af5d2fSBarry Smith 
273f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
274f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
275f1af5d2fSBarry Smith     while (nz--) {
276f1af5d2fSBarry Smith       oidx = 5*(*vi++);
277f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
278f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
279f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
280f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
281f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
282f1af5d2fSBarry Smith       v  += 25;
283f1af5d2fSBarry Smith     }
284f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
285f1af5d2fSBarry Smith     idx += 5;
286f1af5d2fSBarry Smith   }
287f1af5d2fSBarry Smith   /* backward solve the L^T */
288f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
289f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
290f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
291f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
292f1af5d2fSBarry Smith     idt  = 5*i;
293f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
294f1af5d2fSBarry Smith     while (nz--) {
295f1af5d2fSBarry Smith       idx   = 5*(*vi--);
296f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
297f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
298f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
299f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
300f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
301f1af5d2fSBarry Smith       v -= 25;
302f1af5d2fSBarry Smith     }
303f1af5d2fSBarry Smith   }
3041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
306dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
307f1af5d2fSBarry Smith   PetscFunctionReturn(0);
308f1af5d2fSBarry Smith }
309f1af5d2fSBarry Smith 
3104a2ae208SSatish Balay #undef __FUNCT__
3114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
312dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
313f1af5d2fSBarry Smith {
314f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
315dfbe8321SBarry Smith   PetscErrorCode ierr;
316690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
317690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
318f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
32087828ca2SBarry Smith   PetscScalar    *x,*b;
321f1af5d2fSBarry Smith 
322f1af5d2fSBarry Smith   PetscFunctionBegin;
323ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
326f1af5d2fSBarry Smith 
327f1af5d2fSBarry Smith   /* forward solve the U^T */
328f1af5d2fSBarry Smith   idx = 0;
329f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
332f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
333ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
334ef66eb69SBarry Smith     x6    = x[5+idx];
335f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
336f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
337f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
338f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
339f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
340f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
341f1af5d2fSBarry Smith     v += 36;
342f1af5d2fSBarry Smith 
343f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
344f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
345f1af5d2fSBarry Smith     while (nz--) {
346f1af5d2fSBarry Smith       oidx = 6*(*vi++);
347f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
348f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
349f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
350f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
351f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
352f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
353f1af5d2fSBarry Smith       v  += 36;
354f1af5d2fSBarry Smith     }
355f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
356f1af5d2fSBarry Smith     x[5+idx] = s6;
357f1af5d2fSBarry Smith     idx += 6;
358f1af5d2fSBarry Smith   }
359f1af5d2fSBarry Smith   /* backward solve the L^T */
360f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
361f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
362f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
363f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
364f1af5d2fSBarry Smith     idt  = 6*i;
365f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
366f1af5d2fSBarry Smith     s6 = x[5+idt];
367f1af5d2fSBarry Smith     while (nz--) {
368f1af5d2fSBarry Smith       idx   = 6*(*vi--);
369f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
370f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
371f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
372f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
373f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
374f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
375f1af5d2fSBarry Smith       v -= 36;
376f1af5d2fSBarry Smith     }
377f1af5d2fSBarry Smith   }
3781ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
380dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
381f1af5d2fSBarry Smith   PetscFunctionReturn(0);
382f1af5d2fSBarry Smith }
383f1af5d2fSBarry Smith 
3844a2ae208SSatish Balay #undef __FUNCT__
3854a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
386dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
387f1af5d2fSBarry Smith {
388f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
389dfbe8321SBarry Smith   PetscErrorCode ierr;
390690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
391690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
392f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39487828ca2SBarry Smith   PetscScalar    *x,*b;
395f1af5d2fSBarry Smith 
396f1af5d2fSBarry Smith   PetscFunctionBegin;
397ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
400f1af5d2fSBarry Smith 
401f1af5d2fSBarry Smith   /* forward solve the U^T */
402f1af5d2fSBarry Smith   idx = 0;
403f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
404f1af5d2fSBarry Smith 
405f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
406f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
407ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
408ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
409f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
410f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
411f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
412f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
413f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
414f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
415f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
416f1af5d2fSBarry Smith     v += 49;
417f1af5d2fSBarry Smith 
418f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
419f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
420f1af5d2fSBarry Smith     while (nz--) {
421f1af5d2fSBarry Smith       oidx = 7*(*vi++);
422f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
423f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
424f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
425f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
426f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
427f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
428f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
429f1af5d2fSBarry Smith       v  += 49;
430f1af5d2fSBarry Smith     }
431f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
432f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
433f1af5d2fSBarry Smith     idx += 7;
434f1af5d2fSBarry Smith   }
435f1af5d2fSBarry Smith   /* backward solve the L^T */
436f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
437f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
438f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
439f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
440f1af5d2fSBarry Smith     idt  = 7*i;
441f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
442f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
443f1af5d2fSBarry Smith     while (nz--) {
444f1af5d2fSBarry Smith       idx   = 7*(*vi--);
445f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
446f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
447f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
448f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
449f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
450f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
451f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
452f1af5d2fSBarry Smith       v -= 49;
453f1af5d2fSBarry Smith     }
454f1af5d2fSBarry Smith   }
4551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
457dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
458f1af5d2fSBarry Smith   PetscFunctionReturn(0);
459f1af5d2fSBarry Smith }
460f1af5d2fSBarry Smith 
/*---------------------------------------------------------------------------------------------*/
4624a2ae208SSatish Balay #undef __FUNCT__
4634a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
464dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
465f1af5d2fSBarry Smith {
466f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
467f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4686849ba73SBarry Smith   PetscErrorCode ierr;
4695d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4705d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
471690b6cddSBarry Smith   PetscInt       *diag = a->diag;
472f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47387828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
474f1af5d2fSBarry Smith 
475f1af5d2fSBarry Smith   PetscFunctionBegin;
4761ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
478f1af5d2fSBarry Smith   t  = a->solve_work;
479f1af5d2fSBarry Smith 
480f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
481f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
482f1af5d2fSBarry Smith 
483f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
484f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
485f1af5d2fSBarry Smith     t[i] = b[c[i]];
486f1af5d2fSBarry Smith   }
487f1af5d2fSBarry Smith 
488f1af5d2fSBarry Smith   /* forward solve the U^T */
489f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
490f1af5d2fSBarry Smith 
491f1af5d2fSBarry Smith     v     = aa + diag[i];
492f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
493f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
494f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
495f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
496f1af5d2fSBarry Smith     while (nz--) {
497f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
498f1af5d2fSBarry Smith     }
499f1af5d2fSBarry Smith     t[i]   = s1;
500f1af5d2fSBarry Smith   }
501f1af5d2fSBarry Smith   /* backward solve the L^T */
502f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
503f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
504f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
505f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
506f1af5d2fSBarry Smith     s1   = t[i];
507f1af5d2fSBarry Smith     while (nz--) {
508f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
509f1af5d2fSBarry Smith     }
510f1af5d2fSBarry Smith   }
511f1af5d2fSBarry Smith 
512f1af5d2fSBarry Smith   /* copy t into x according to permutation */
513f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
514f1af5d2fSBarry Smith     x[r[i]]   = t[i];
515f1af5d2fSBarry Smith   }
516f1af5d2fSBarry Smith 
517f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
518f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5191ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
521dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
522f1af5d2fSBarry Smith   PetscFunctionReturn(0);
523f1af5d2fSBarry Smith }
524f1af5d2fSBarry Smith 
5254a2ae208SSatish Balay #undef __FUNCT__
5264a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
527dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
528f1af5d2fSBarry Smith {
529f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
530f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5316849ba73SBarry Smith   PetscErrorCode ierr;
5325d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5335d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
534690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
535f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53687828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
538f1af5d2fSBarry Smith 
539f1af5d2fSBarry Smith   PetscFunctionBegin;
5401ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5411ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
542f1af5d2fSBarry Smith   t  = a->solve_work;
543f1af5d2fSBarry Smith 
544f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
545f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
546f1af5d2fSBarry Smith 
547f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
548f1af5d2fSBarry Smith   ii = 0;
549f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
550f1af5d2fSBarry Smith     ic      = 2*c[i];
551f1af5d2fSBarry Smith     t[ii]   = b[ic];
552f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
553f1af5d2fSBarry Smith     ii += 2;
554f1af5d2fSBarry Smith   }
555f1af5d2fSBarry Smith 
556f1af5d2fSBarry Smith   /* forward solve the U^T */
557f1af5d2fSBarry Smith   idx = 0;
558f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
559f1af5d2fSBarry Smith 
560f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
561f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
562f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
563f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
564f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
565f1af5d2fSBarry Smith     v += 4;
566f1af5d2fSBarry Smith 
567f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
568f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
569f1af5d2fSBarry Smith     while (nz--) {
570f1af5d2fSBarry Smith       oidx = 2*(*vi++);
571f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
572f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
573f1af5d2fSBarry Smith       v  += 4;
574f1af5d2fSBarry Smith     }
575f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
576f1af5d2fSBarry Smith     idx += 2;
577f1af5d2fSBarry Smith   }
578f1af5d2fSBarry Smith   /* backward solve the L^T */
579f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
580f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
581f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
582f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
583f1af5d2fSBarry Smith     idt  = 2*i;
584f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
585f1af5d2fSBarry Smith     while (nz--) {
586f1af5d2fSBarry Smith       idx   = 2*(*vi--);
587f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
588f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
589f1af5d2fSBarry Smith       v -= 4;
590f1af5d2fSBarry Smith     }
591f1af5d2fSBarry Smith   }
592f1af5d2fSBarry Smith 
593f1af5d2fSBarry Smith   /* copy t into x according to permutation */
594f1af5d2fSBarry Smith   ii = 0;
595f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
596f1af5d2fSBarry Smith     ir      = 2*r[i];
597f1af5d2fSBarry Smith     x[ir]   = t[ii];
598f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
599f1af5d2fSBarry Smith     ii += 2;
600f1af5d2fSBarry Smith   }
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
603f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
606dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
607f1af5d2fSBarry Smith   PetscFunctionReturn(0);
608f1af5d2fSBarry Smith }
609f1af5d2fSBarry Smith 
6104a2ae208SSatish Balay #undef __FUNCT__
6114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
612dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
613f1af5d2fSBarry Smith {
614f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
615f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6166849ba73SBarry Smith   PetscErrorCode ierr;
6175d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6185d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
619690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
620f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62187828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
623f1af5d2fSBarry Smith 
624f1af5d2fSBarry Smith   PetscFunctionBegin;
6251ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
627f1af5d2fSBarry Smith   t  = a->solve_work;
628f1af5d2fSBarry Smith 
629f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
630f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
631f1af5d2fSBarry Smith 
632f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
633f1af5d2fSBarry Smith   ii = 0;
634f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
635f1af5d2fSBarry Smith     ic      = 3*c[i];
636f1af5d2fSBarry Smith     t[ii]   = b[ic];
637f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
638f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
639f1af5d2fSBarry Smith     ii += 3;
640f1af5d2fSBarry Smith   }
641f1af5d2fSBarry Smith 
642f1af5d2fSBarry Smith   /* forward solve the U^T */
643f1af5d2fSBarry Smith   idx = 0;
644f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
645f1af5d2fSBarry Smith 
646f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
647f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
648f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
649f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
650f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
651f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
652f1af5d2fSBarry Smith     v += 9;
653f1af5d2fSBarry Smith 
654f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
655f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
656f1af5d2fSBarry Smith     while (nz--) {
657f1af5d2fSBarry Smith       oidx = 3*(*vi++);
658f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
659f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
660f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
661f1af5d2fSBarry Smith       v  += 9;
662f1af5d2fSBarry Smith     }
663f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
664f1af5d2fSBarry Smith     idx += 3;
665f1af5d2fSBarry Smith   }
666f1af5d2fSBarry Smith   /* backward solve the L^T */
667f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
668f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
669f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
670f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
671f1af5d2fSBarry Smith     idt  = 3*i;
672f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
673f1af5d2fSBarry Smith     while (nz--) {
674f1af5d2fSBarry Smith       idx   = 3*(*vi--);
675f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
676f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
677f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
678f1af5d2fSBarry Smith       v -= 9;
679f1af5d2fSBarry Smith     }
680f1af5d2fSBarry Smith   }
681f1af5d2fSBarry Smith 
682f1af5d2fSBarry Smith   /* copy t into x according to permutation */
683f1af5d2fSBarry Smith   ii = 0;
684f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
685f1af5d2fSBarry Smith     ir      = 3*r[i];
686f1af5d2fSBarry Smith     x[ir]   = t[ii];
687f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
688f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
689f1af5d2fSBarry Smith     ii += 3;
690f1af5d2fSBarry Smith   }
691f1af5d2fSBarry Smith 
692f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
693f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
696dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
697f1af5d2fSBarry Smith   PetscFunctionReturn(0);
698f1af5d2fSBarry Smith }
699f1af5d2fSBarry Smith 
7004a2ae208SSatish Balay #undef __FUNCT__
7014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
702dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
703f1af5d2fSBarry Smith {
704f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
705f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7066849ba73SBarry Smith   PetscErrorCode ierr;
7075d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7085d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
709690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
710f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
713f1af5d2fSBarry Smith 
714f1af5d2fSBarry Smith   PetscFunctionBegin;
7151ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7161ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
717f1af5d2fSBarry Smith   t  = a->solve_work;
718f1af5d2fSBarry Smith 
719f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
720f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
721f1af5d2fSBarry Smith 
722f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
723f1af5d2fSBarry Smith   ii = 0;
724f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
725f1af5d2fSBarry Smith     ic      = 4*c[i];
726f1af5d2fSBarry Smith     t[ii]   = b[ic];
727f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
728f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
729f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
730f1af5d2fSBarry Smith     ii += 4;
731f1af5d2fSBarry Smith   }
732f1af5d2fSBarry Smith 
733f1af5d2fSBarry Smith   /* forward solve the U^T */
734f1af5d2fSBarry Smith   idx = 0;
735f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
736f1af5d2fSBarry Smith 
737f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
738f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
739f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
741f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
742f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
743f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
744f1af5d2fSBarry Smith     v += 16;
745f1af5d2fSBarry Smith 
746f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
747f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
748f1af5d2fSBarry Smith     while (nz--) {
749f1af5d2fSBarry Smith       oidx = 4*(*vi++);
750f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
751f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
752f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
753f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
754f1af5d2fSBarry Smith       v  += 16;
755f1af5d2fSBarry Smith     }
756f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
757f1af5d2fSBarry Smith     idx += 4;
758f1af5d2fSBarry Smith   }
759f1af5d2fSBarry Smith   /* backward solve the L^T */
760f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
761f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
762f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
763f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
764f1af5d2fSBarry Smith     idt  = 4*i;
765f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
766f1af5d2fSBarry Smith     while (nz--) {
767f1af5d2fSBarry Smith       idx   = 4*(*vi--);
768f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
769f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
770f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
771f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
772f1af5d2fSBarry Smith       v -= 16;
773f1af5d2fSBarry Smith     }
774f1af5d2fSBarry Smith   }
775f1af5d2fSBarry Smith 
776f1af5d2fSBarry Smith   /* copy t into x according to permutation */
777f1af5d2fSBarry Smith   ii = 0;
778f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
779f1af5d2fSBarry Smith     ir      = 4*r[i];
780f1af5d2fSBarry Smith     x[ir]   = t[ii];
781f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
782f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
783f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
784f1af5d2fSBarry Smith     ii += 4;
785f1af5d2fSBarry Smith   }
786f1af5d2fSBarry Smith 
787f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
788f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
791dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
792f1af5d2fSBarry Smith   PetscFunctionReturn(0);
793f1af5d2fSBarry Smith }
794f1af5d2fSBarry Smith 
7954a2ae208SSatish Balay #undef __FUNCT__
7964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
797dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
798f1af5d2fSBarry Smith {
799f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
800f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8016849ba73SBarry Smith   PetscErrorCode ierr;
8025d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
804690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
805f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
808f1af5d2fSBarry Smith 
809f1af5d2fSBarry Smith   PetscFunctionBegin;
8101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
812f1af5d2fSBarry Smith   t  = a->solve_work;
813f1af5d2fSBarry Smith 
814f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
815f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
816f1af5d2fSBarry Smith 
817f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
818f1af5d2fSBarry Smith   ii = 0;
819f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
820f1af5d2fSBarry Smith     ic      = 5*c[i];
821f1af5d2fSBarry Smith     t[ii]   = b[ic];
822f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
823f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
824f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
825f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
826f1af5d2fSBarry Smith     ii += 5;
827f1af5d2fSBarry Smith   }
828f1af5d2fSBarry Smith 
829f1af5d2fSBarry Smith   /* forward solve the U^T */
830f1af5d2fSBarry Smith   idx = 0;
831f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
832f1af5d2fSBarry Smith 
833f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
834f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
835f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
836f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
837f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
838f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
839f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
840f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
841f1af5d2fSBarry Smith     v += 25;
842f1af5d2fSBarry Smith 
843f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
844f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
845f1af5d2fSBarry Smith     while (nz--) {
846f1af5d2fSBarry Smith       oidx = 5*(*vi++);
847f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
848f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
849f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
850f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
851f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
852f1af5d2fSBarry Smith       v  += 25;
853f1af5d2fSBarry Smith     }
854f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
855f1af5d2fSBarry Smith     idx += 5;
856f1af5d2fSBarry Smith   }
857f1af5d2fSBarry Smith   /* backward solve the L^T */
858f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
859f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
860f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
861f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
862f1af5d2fSBarry Smith     idt  = 5*i;
863f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
864f1af5d2fSBarry Smith     while (nz--) {
865f1af5d2fSBarry Smith       idx   = 5*(*vi--);
866f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
867f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
868f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
869f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
870f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
871f1af5d2fSBarry Smith       v -= 25;
872f1af5d2fSBarry Smith     }
873f1af5d2fSBarry Smith   }
874f1af5d2fSBarry Smith 
875f1af5d2fSBarry Smith   /* copy t into x according to permutation */
876f1af5d2fSBarry Smith   ii = 0;
877f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
878f1af5d2fSBarry Smith     ir      = 5*r[i];
879f1af5d2fSBarry Smith     x[ir]   = t[ii];
880f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
881f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
882f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
883f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
884f1af5d2fSBarry Smith     ii += 5;
885f1af5d2fSBarry Smith   }
886f1af5d2fSBarry Smith 
887f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
888f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8891ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
891dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
892f1af5d2fSBarry Smith   PetscFunctionReturn(0);
893f1af5d2fSBarry Smith }
894f1af5d2fSBarry Smith 
8954a2ae208SSatish Balay #undef __FUNCT__
8964a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
897dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
898f1af5d2fSBarry Smith {
899f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
900f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9016849ba73SBarry Smith   PetscErrorCode ierr;
9025d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
904690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
905f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
908f1af5d2fSBarry Smith 
909f1af5d2fSBarry Smith   PetscFunctionBegin;
9101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
912f1af5d2fSBarry Smith   t  = a->solve_work;
913f1af5d2fSBarry Smith 
914f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
915f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
916f1af5d2fSBarry Smith 
917f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
918f1af5d2fSBarry Smith   ii = 0;
919f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
920f1af5d2fSBarry Smith     ic      = 6*c[i];
921f1af5d2fSBarry Smith     t[ii]   = b[ic];
922f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
923f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
924f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
925f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
926f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
927f1af5d2fSBarry Smith     ii += 6;
928f1af5d2fSBarry Smith   }
929f1af5d2fSBarry Smith 
930f1af5d2fSBarry Smith   /* forward solve the U^T */
931f1af5d2fSBarry Smith   idx = 0;
932f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
933f1af5d2fSBarry Smith 
934f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
935f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
936f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
937f1af5d2fSBarry Smith     x6    = t[5+idx];
938f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
939f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
940f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
941f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
942f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
943f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
944f1af5d2fSBarry Smith     v += 36;
945f1af5d2fSBarry Smith 
946f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
947f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
948f1af5d2fSBarry Smith     while (nz--) {
949f1af5d2fSBarry Smith       oidx = 6*(*vi++);
950f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
951f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
952f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
953f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
954f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
955f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
956f1af5d2fSBarry Smith       v  += 36;
957f1af5d2fSBarry Smith     }
958f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
959f1af5d2fSBarry Smith     t[5+idx] = s6;
960f1af5d2fSBarry Smith     idx += 6;
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith   /* backward solve the L^T */
963f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
964f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
965f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
966f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
967f1af5d2fSBarry Smith     idt  = 6*i;
968f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
969f1af5d2fSBarry Smith     s6 = t[5+idt];
970f1af5d2fSBarry Smith     while (nz--) {
971f1af5d2fSBarry Smith       idx   = 6*(*vi--);
972f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
973f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
974f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
975f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
976f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
977f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
978f1af5d2fSBarry Smith       v -= 36;
979f1af5d2fSBarry Smith     }
980f1af5d2fSBarry Smith   }
981f1af5d2fSBarry Smith 
982f1af5d2fSBarry Smith   /* copy t into x according to permutation */
983f1af5d2fSBarry Smith   ii = 0;
984f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
985f1af5d2fSBarry Smith     ir      = 6*r[i];
986f1af5d2fSBarry Smith     x[ir]   = t[ii];
987f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
988f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
989f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
990f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
991f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
992f1af5d2fSBarry Smith     ii += 6;
993f1af5d2fSBarry Smith   }
994f1af5d2fSBarry Smith 
995f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
996f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9971ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9981ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
999dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1000f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1001f1af5d2fSBarry Smith }
1002f1af5d2fSBarry Smith 
10034a2ae208SSatish Balay #undef __FUNCT__
10044a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1005dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1006f1af5d2fSBarry Smith {
1007f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1008f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10096849ba73SBarry Smith   PetscErrorCode ierr;
10105d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10115d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1012690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1013f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1016f1af5d2fSBarry Smith 
1017f1af5d2fSBarry Smith   PetscFunctionBegin;
10181ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10191ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1020f1af5d2fSBarry Smith   t  = a->solve_work;
1021f1af5d2fSBarry Smith 
1022f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1023f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1024f1af5d2fSBarry Smith 
1025f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1026f1af5d2fSBarry Smith   ii = 0;
1027f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1028f1af5d2fSBarry Smith     ic      = 7*c[i];
1029f1af5d2fSBarry Smith     t[ii]   = b[ic];
1030f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1031f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1032f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1033f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1034f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1035f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1036f1af5d2fSBarry Smith     ii += 7;
1037f1af5d2fSBarry Smith   }
1038f1af5d2fSBarry Smith 
1039f1af5d2fSBarry Smith   /* forward solve the U^T */
1040f1af5d2fSBarry Smith   idx = 0;
1041f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1042f1af5d2fSBarry Smith 
1043f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1044f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1045f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1046f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1047f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1048f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1049f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1050f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1051f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1052f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1053f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1054f1af5d2fSBarry Smith     v += 49;
1055f1af5d2fSBarry Smith 
1056f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1057f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1058f1af5d2fSBarry Smith     while (nz--) {
1059f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1060f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1061f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1062f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1063f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1064f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1065f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1066f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1067f1af5d2fSBarry Smith       v  += 49;
1068f1af5d2fSBarry Smith     }
1069f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1070f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1071f1af5d2fSBarry Smith     idx += 7;
1072f1af5d2fSBarry Smith   }
1073f1af5d2fSBarry Smith   /* backward solve the L^T */
1074f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1075f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1076f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1077f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1078f1af5d2fSBarry Smith     idt  = 7*i;
1079f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1080f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1081f1af5d2fSBarry Smith     while (nz--) {
1082f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1083f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1084f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1085f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1086f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1087f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1088f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1089f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1090f1af5d2fSBarry Smith       v -= 49;
1091f1af5d2fSBarry Smith     }
1092f1af5d2fSBarry Smith   }
1093f1af5d2fSBarry Smith 
1094f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1095f1af5d2fSBarry Smith   ii = 0;
1096f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1097f1af5d2fSBarry Smith     ir      = 7*r[i];
1098f1af5d2fSBarry Smith     x[ir]   = t[ii];
1099f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1100f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1101f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1102f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1103f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1104f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1105f1af5d2fSBarry Smith     ii += 7;
1106f1af5d2fSBarry Smith   }
1107f1af5d2fSBarry Smith 
1108f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1109f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1112dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1113f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1114f1af5d2fSBarry Smith }
1115f1af5d2fSBarry Smith 
11164e2b4712SSatish Balay /* ----------------------------------------------------------- */
11174a2ae208SSatish Balay #undef __FUNCT__
11184a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1119dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11204e2b4712SSatish Balay {
11214e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11224e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11236849ba73SBarry Smith   PetscErrorCode ierr;
11245d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11255d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11265d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11273f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
112887828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11294e2b4712SSatish Balay 
11304e2b4712SSatish Balay   PetscFunctionBegin;
11311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1133f1af5d2fSBarry Smith   t  = a->solve_work;
11344e2b4712SSatish Balay 
11354e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11364e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11374e2b4712SSatish Balay 
11384e2b4712SSatish Balay   /* forward solve the lower triangular */
113987828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11404e2b4712SSatish Balay   for (i=1; i<n; i++) {
11414e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11424e2b4712SSatish Balay     vi  = aj + ai[i];
11434e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1144f1af5d2fSBarry Smith     s = t + bs*i;
114587828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11464e2b4712SSatish Balay     while (nz--) {
1147f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11484e2b4712SSatish Balay       v += bs2;
11494e2b4712SSatish Balay     }
11504e2b4712SSatish Balay   }
11514e2b4712SSatish Balay   /* backward solve the upper triangular */
1152d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11534e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11544e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11554e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11564e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115787828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11584e2b4712SSatish Balay     while (nz--) {
1159f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11604e2b4712SSatish Balay       v += bs2;
11614e2b4712SSatish Balay     }
1162f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116387828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11644e2b4712SSatish Balay   }
11654e2b4712SSatish Balay 
11664e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11674e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11714e2b4712SSatish Balay   PetscFunctionReturn(0);
11724e2b4712SSatish Balay }
11734e2b4712SSatish Balay 
11744a2ae208SSatish Balay #undef __FUNCT__
11754a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1176dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11774e2b4712SSatish Balay {
11784e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11794e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11806849ba73SBarry Smith   PetscErrorCode ierr;
11815d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11825d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11833f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11864e2b4712SSatish Balay 
11874e2b4712SSatish Balay   PetscFunctionBegin;
11881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1190f1af5d2fSBarry Smith   t  = a->solve_work;
11914e2b4712SSatish Balay 
11924e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11934e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11944e2b4712SSatish Balay 
11954e2b4712SSatish Balay   /* forward solve the lower triangular */
11964e2b4712SSatish Balay   idx    = 7*(*r++);
1197f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1198f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1199f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12004e2b4712SSatish Balay 
12014e2b4712SSatish Balay   for (i=1; i<n; i++) {
12024e2b4712SSatish Balay     v     = aa + 49*ai[i];
12034e2b4712SSatish Balay     vi    = aj + ai[i];
12044e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12054e2b4712SSatish Balay     idx   = 7*(*r++);
1206f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1207f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12084e2b4712SSatish Balay     while (nz--) {
12094e2b4712SSatish Balay       idx   = 7*(*vi++);
1210f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1211f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1212f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1213f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1214f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1215f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1216f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1217f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1218f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1219f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12204e2b4712SSatish Balay       v += 49;
12214e2b4712SSatish Balay     }
12224e2b4712SSatish Balay     idx = 7*i;
1223f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1224f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1225f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12264e2b4712SSatish Balay   }
12274e2b4712SSatish Balay   /* backward solve the upper triangular */
12284e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12294e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12304e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12314e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12324e2b4712SSatish Balay     idt  = 7*i;
1233f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1234f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1235f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12364e2b4712SSatish Balay     while (nz--) {
12374e2b4712SSatish Balay       idx   = 7*(*vi++);
1238f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1239f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1240f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1241f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1242f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1243f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1244f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1245f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1246f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1247f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12484e2b4712SSatish Balay       v += 49;
12494e2b4712SSatish Balay     }
12504e2b4712SSatish Balay     idc = 7*(*c--);
12514e2b4712SSatish Balay     v   = aa + 49*diag[i];
1252f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1253f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1254f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1255f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1256f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1257f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1258f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1259f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1260f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1261f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1262f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1263f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1264f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1265f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12664e2b4712SSatish Balay   }
12674e2b4712SSatish Balay 
12684e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12694e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1272dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12734e2b4712SSatish Balay   PetscFunctionReturn(0);
12744e2b4712SSatish Balay }
12754e2b4712SSatish Balay 
12764a2ae208SSatish Balay #undef __FUNCT__
12774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1278dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
127915091d37SBarry Smith {
128015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1281690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1282dfbe8321SBarry Smith   PetscErrorCode    ierr;
1283690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1284d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1285d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1286d9fead3dSBarry Smith   const PetscScalar *b;
128715091d37SBarry Smith 
128815091d37SBarry Smith   PetscFunctionBegin;
1289d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129115091d37SBarry Smith   /* forward solve the lower triangular */
129215091d37SBarry Smith   idx    = 0;
129315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
129415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
129515091d37SBarry Smith   x[6] = b[6+idx];
129615091d37SBarry Smith   for (i=1; i<n; i++) {
129715091d37SBarry Smith     v     =  aa + 49*ai[i];
129815091d37SBarry Smith     vi    =  aj + ai[i];
129915091d37SBarry Smith     nz    =  diag[i] - ai[i];
130015091d37SBarry Smith     idx   =  7*i;
1301f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1302f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1303f1af5d2fSBarry Smith     s7  =  b[6+idx];
130415091d37SBarry Smith     while (nz--) {
130515091d37SBarry Smith       jdx   = 7*(*vi++);
130615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
130715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
130815091d37SBarry Smith       x7    = x[6+jdx];
1309f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1310f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1311f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1312f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1313f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1314f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1315f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131615091d37SBarry Smith       v += 49;
131715091d37SBarry Smith      }
1318f1af5d2fSBarry Smith     x[idx]   = s1;
1319f1af5d2fSBarry Smith     x[1+idx] = s2;
1320f1af5d2fSBarry Smith     x[2+idx] = s3;
1321f1af5d2fSBarry Smith     x[3+idx] = s4;
1322f1af5d2fSBarry Smith     x[4+idx] = s5;
1323f1af5d2fSBarry Smith     x[5+idx] = s6;
1324f1af5d2fSBarry Smith     x[6+idx] = s7;
132515091d37SBarry Smith   }
132615091d37SBarry Smith   /* backward solve the upper triangular */
132715091d37SBarry Smith   for (i=n-1; i>=0; i--){
132815091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
132915091d37SBarry Smith     vi   = aj + diag[i] + 1;
133015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
133115091d37SBarry Smith     idt  = 7*i;
1332f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1333f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1334f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1335f1af5d2fSBarry Smith     s7 = x[6+idt];
133615091d37SBarry Smith     while (nz--) {
133715091d37SBarry Smith       idx   = 7*(*vi++);
133815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
133915091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
134015091d37SBarry Smith       x7    = x[6+idx];
1341f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1342f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1343f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1344f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1345f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1346f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1347f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134815091d37SBarry Smith       v += 49;
134915091d37SBarry Smith     }
135015091d37SBarry Smith     v        = aa + 49*diag[i];
1351f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1352f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1353f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1354f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1355f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1356f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1357f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1358f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1359f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1360f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1361f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1362f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1363f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1364f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
136515091d37SBarry Smith   }
136615091d37SBarry Smith 
1367d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1369dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
137015091d37SBarry Smith   PetscFunctionReturn(0);
137115091d37SBarry Smith }
137215091d37SBarry Smith 
13734a2ae208SSatish Balay #undef __FUNCT__
13744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1375dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
137615091d37SBarry Smith {
137715091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
137815091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
13796849ba73SBarry Smith   PetscErrorCode    ierr;
13805d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
13815d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1382d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1383d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1384d9fead3dSBarry Smith   const PetscScalar *b;
138515091d37SBarry Smith   PetscFunctionBegin;
1386d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1388f1af5d2fSBarry Smith   t  = a->solve_work;
138915091d37SBarry Smith 
139015091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
139115091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
139215091d37SBarry Smith 
139315091d37SBarry Smith   /* forward solve the lower triangular */
139415091d37SBarry Smith   idx    = 6*(*r++);
1395f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1396f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1397f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
139815091d37SBarry Smith   for (i=1; i<n; i++) {
139915091d37SBarry Smith     v     = aa + 36*ai[i];
140015091d37SBarry Smith     vi    = aj + ai[i];
140115091d37SBarry Smith     nz    = diag[i] - ai[i];
140215091d37SBarry Smith     idx   = 6*(*r++);
1403f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1404f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
140515091d37SBarry Smith     while (nz--) {
140615091d37SBarry Smith       idx   = 6*(*vi++);
1407f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1408f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1409f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1410f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1411f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1412f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1413f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1414f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141515091d37SBarry Smith       v += 36;
141615091d37SBarry Smith     }
141715091d37SBarry Smith     idx = 6*i;
1418f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1419f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1420f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
142115091d37SBarry Smith   }
142215091d37SBarry Smith   /* backward solve the upper triangular */
142315091d37SBarry Smith   for (i=n-1; i>=0; i--){
142415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
142515091d37SBarry Smith     vi   = aj + diag[i] + 1;
142615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
142715091d37SBarry Smith     idt  = 6*i;
1428f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1429f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1430f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
143115091d37SBarry Smith     while (nz--) {
143215091d37SBarry Smith       idx   = 6*(*vi++);
1433f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1434f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1435f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1436f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1437f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1438f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1439f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1440f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1441f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
144215091d37SBarry Smith       v += 36;
144315091d37SBarry Smith     }
144415091d37SBarry Smith     idc = 6*(*c--);
144515091d37SBarry Smith     v   = aa + 36*diag[i];
1446f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1447f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1448f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1449f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1450f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1451f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1452f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1453f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1454f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1455f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1456f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1457f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
145815091d37SBarry Smith   }
145915091d37SBarry Smith 
146015091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
146115091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1462d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1464dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
146515091d37SBarry Smith   PetscFunctionReturn(0);
146615091d37SBarry Smith }
146715091d37SBarry Smith 
14684a2ae208SSatish Balay #undef __FUNCT__
14694a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1470dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
147115091d37SBarry Smith {
147215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1473690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1474dfbe8321SBarry Smith   PetscErrorCode    ierr;
1475690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1476d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1477d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1478d9fead3dSBarry Smith   const PetscScalar *b;
147915091d37SBarry Smith 
148015091d37SBarry Smith   PetscFunctionBegin;
1481d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
148315091d37SBarry Smith   /* forward solve the lower triangular */
148415091d37SBarry Smith   idx    = 0;
148515091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
148615091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
148715091d37SBarry Smith   for (i=1; i<n; i++) {
148815091d37SBarry Smith     v     =  aa + 36*ai[i];
148915091d37SBarry Smith     vi    =  aj + ai[i];
149015091d37SBarry Smith     nz    =  diag[i] - ai[i];
149115091d37SBarry Smith     idx   =  6*i;
1492f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1493f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
149415091d37SBarry Smith     while (nz--) {
149515091d37SBarry Smith       jdx   = 6*(*vi++);
149615091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
149715091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1498f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1499f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1500f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1501f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1502f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1503f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150415091d37SBarry Smith       v += 36;
150515091d37SBarry Smith      }
1506f1af5d2fSBarry Smith     x[idx]   = s1;
1507f1af5d2fSBarry Smith     x[1+idx] = s2;
1508f1af5d2fSBarry Smith     x[2+idx] = s3;
1509f1af5d2fSBarry Smith     x[3+idx] = s4;
1510f1af5d2fSBarry Smith     x[4+idx] = s5;
1511f1af5d2fSBarry Smith     x[5+idx] = s6;
151215091d37SBarry Smith   }
151315091d37SBarry Smith   /* backward solve the upper triangular */
151415091d37SBarry Smith   for (i=n-1; i>=0; i--){
151515091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
151615091d37SBarry Smith     vi   = aj + diag[i] + 1;
151715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151815091d37SBarry Smith     idt  = 6*i;
1519f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1520f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1521f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
152215091d37SBarry Smith     while (nz--) {
152315091d37SBarry Smith       idx   = 6*(*vi++);
152415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
152515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1526f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1527f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1528f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1529f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1530f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1531f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
153215091d37SBarry Smith       v += 36;
153315091d37SBarry Smith     }
153415091d37SBarry Smith     v        = aa + 36*diag[i];
1535f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1536f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1537f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1538f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1539f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1540f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
154115091d37SBarry Smith   }
154215091d37SBarry Smith 
1543d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1545dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
154615091d37SBarry Smith   PetscFunctionReturn(0);
154715091d37SBarry Smith }
154815091d37SBarry Smith 
15494a2ae208SSatish Balay #undef __FUNCT__
15504a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1551dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15524e2b4712SSatish Balay {
15534e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
15544e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
15556849ba73SBarry Smith   PetscErrorCode    ierr;
15565d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
15575d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1558d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1559d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1560d9fead3dSBarry Smith   const PetscScalar *b;
15614e2b4712SSatish Balay 
15624e2b4712SSatish Balay   PetscFunctionBegin;
1563d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1565f1af5d2fSBarry Smith   t  = a->solve_work;
15664e2b4712SSatish Balay 
15674e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15684e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15694e2b4712SSatish Balay 
15704e2b4712SSatish Balay   /* forward solve the lower triangular */
15714e2b4712SSatish Balay   idx    = 5*(*r++);
1572f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1573f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15744e2b4712SSatish Balay   for (i=1; i<n; i++) {
15754e2b4712SSatish Balay     v     = aa + 25*ai[i];
15764e2b4712SSatish Balay     vi    = aj + ai[i];
15774e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15784e2b4712SSatish Balay     idx   = 5*(*r++);
1579f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1580f1af5d2fSBarry Smith     s5  = b[4+idx];
15814e2b4712SSatish Balay     while (nz--) {
15824e2b4712SSatish Balay       idx   = 5*(*vi++);
1583f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1584f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1585f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1586f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1587f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1588f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1589f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15904e2b4712SSatish Balay       v += 25;
15914e2b4712SSatish Balay     }
15924e2b4712SSatish Balay     idx = 5*i;
1593f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1594f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15954e2b4712SSatish Balay   }
15964e2b4712SSatish Balay   /* backward solve the upper triangular */
15974e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15984e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15994e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
16004e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
16014e2b4712SSatish Balay     idt  = 5*i;
1602f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1603f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
16044e2b4712SSatish Balay     while (nz--) {
16054e2b4712SSatish Balay       idx   = 5*(*vi++);
1606f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1607f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1608f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1609f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1610f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1611f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1612f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16134e2b4712SSatish Balay       v += 25;
16144e2b4712SSatish Balay     }
16154e2b4712SSatish Balay     idc = 5*(*c--);
16164e2b4712SSatish Balay     v   = aa + 25*diag[i];
1617f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1618f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1619f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1620f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1621f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1622f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1623f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1624f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1625f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1626f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16274e2b4712SSatish Balay   }
16284e2b4712SSatish Balay 
16294e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16304e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1631d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1633dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
16344e2b4712SSatish Balay   PetscFunctionReturn(0);
16354e2b4712SSatish Balay }
16364e2b4712SSatish Balay 
1637*84a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1638*84a281e5SHong Zhang {
1639*84a281e5SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1640*84a281e5SHong Zhang   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1641*84a281e5SHong Zhang   PetscErrorCode    ierr;
1642*84a281e5SHong Zhang   PetscInt          jdx;
1643*84a281e5SHong Zhang   const MatScalar   *aa=a->a,*v;
1644*84a281e5SHong Zhang   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1645*84a281e5SHong Zhang   const PetscScalar *b;
1646*84a281e5SHong Zhang 
1647*84a281e5SHong Zhang   PetscFunctionBegin;
1648*84a281e5SHong Zhang   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1649*84a281e5SHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1650*84a281e5SHong Zhang   /* forward solve the lower triangular */
1651*84a281e5SHong Zhang   idx    = 0;
1652*84a281e5SHong Zhang   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
1653*84a281e5SHong Zhang   for (i=1; i<n; i++) {
1654*84a281e5SHong Zhang     v   = aa + 25*ai[i];
1655*84a281e5SHong Zhang     vi  = aj + ai[i];
1656*84a281e5SHong Zhang     nz  = ai[i+1] - ai[i];
1657*84a281e5SHong Zhang     idx = 5*i;
1658*84a281e5SHong Zhang     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
1659*84a281e5SHong Zhang     while (nz--) {
1660*84a281e5SHong Zhang       jdx   = 5*(*vi++);
1661*84a281e5SHong Zhang       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1662*84a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1663*84a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1664*84a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1665*84a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1666*84a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1667*84a281e5SHong Zhang       v    += 25;
1668*84a281e5SHong Zhang     }
1669*84a281e5SHong Zhang     x[idx]   = s1;
1670*84a281e5SHong Zhang     x[1+idx] = s2;
1671*84a281e5SHong Zhang     x[2+idx] = s3;
1672*84a281e5SHong Zhang     x[3+idx] = s4;
1673*84a281e5SHong Zhang     x[4+idx] = s5;
1674*84a281e5SHong Zhang   }
1675*84a281e5SHong Zhang 
1676*84a281e5SHong Zhang   /* backward solve the upper triangular */
1677*84a281e5SHong Zhang   for (i=n-1; i>=0; i--){
1678*84a281e5SHong Zhang     v   = aa + 25*ai[2*n-i];
1679*84a281e5SHong Zhang     vi  = aj + ai[2*n-i];
1680*84a281e5SHong Zhang     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
1681*84a281e5SHong Zhang     idt = 5*i;
1682*84a281e5SHong Zhang     s1 = x[idt];  s2 = x[1+idt];
1683*84a281e5SHong Zhang     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
1684*84a281e5SHong Zhang     while (nz--) {
1685*84a281e5SHong Zhang       idx   = 5*(*vi++);
1686*84a281e5SHong Zhang       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1687*84a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1688*84a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1689*84a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1690*84a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1691*84a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
1692*84a281e5SHong Zhang       v    += 25;
1693*84a281e5SHong Zhang     }
1694*84a281e5SHong Zhang     /* x = inv_diagonal*x */
1695*84a281e5SHong Zhang     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1696*84a281e5SHong Zhang     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1697*84a281e5SHong Zhang     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1698*84a281e5SHong Zhang     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1699*84a281e5SHong Zhang     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
1700*84a281e5SHong Zhang   }
1701*84a281e5SHong Zhang 
1702*84a281e5SHong Zhang   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1703*84a281e5SHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1704*84a281e5SHong Zhang   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1705*84a281e5SHong Zhang   PetscFunctionReturn(0);
1706*84a281e5SHong Zhang }
1707*84a281e5SHong Zhang 
17084a2ae208SSatish Balay #undef __FUNCT__
17094a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1710dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
171115091d37SBarry Smith {
171215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1713690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1714dfbe8321SBarry Smith   PetscErrorCode    ierr;
1715690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1716d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1717d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1718d9fead3dSBarry Smith   const PetscScalar *b;
171915091d37SBarry Smith 
172015091d37SBarry Smith   PetscFunctionBegin;
1721d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172315091d37SBarry Smith   /* forward solve the lower triangular */
172415091d37SBarry Smith   idx    = 0;
172515091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
172615091d37SBarry Smith   for (i=1; i<n; i++) {
172715091d37SBarry Smith     v     =  aa + 25*ai[i];
172815091d37SBarry Smith     vi    =  aj + ai[i];
172915091d37SBarry Smith     nz    =  diag[i] - ai[i];
173015091d37SBarry Smith     idx   =  5*i;
1731f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
173215091d37SBarry Smith     while (nz--) {
173315091d37SBarry Smith       jdx   = 5*(*vi++);
173415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1735f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1736f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1737f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1738f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1739f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
174015091d37SBarry Smith       v    += 25;
174115091d37SBarry Smith     }
1742f1af5d2fSBarry Smith     x[idx]   = s1;
1743f1af5d2fSBarry Smith     x[1+idx] = s2;
1744f1af5d2fSBarry Smith     x[2+idx] = s3;
1745f1af5d2fSBarry Smith     x[3+idx] = s4;
1746f1af5d2fSBarry Smith     x[4+idx] = s5;
174715091d37SBarry Smith   }
174815091d37SBarry Smith   /* backward solve the upper triangular */
174915091d37SBarry Smith   for (i=n-1; i>=0; i--){
175015091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
175115091d37SBarry Smith     vi   = aj + diag[i] + 1;
175215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
175315091d37SBarry Smith     idt  = 5*i;
1754f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1755f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
175615091d37SBarry Smith     while (nz--) {
175715091d37SBarry Smith       idx   = 5*(*vi++);
175815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1759f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1760f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1761f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1762f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1763f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
176415091d37SBarry Smith       v    += 25;
176515091d37SBarry Smith     }
176615091d37SBarry Smith     v        = aa + 25*diag[i];
1767f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1768f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1769f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1770f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1771f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
177215091d37SBarry Smith   }
177315091d37SBarry Smith 
1774d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1776dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
177715091d37SBarry Smith   PetscFunctionReturn(0);
177815091d37SBarry Smith }
177915091d37SBarry Smith 
17804a2ae208SSatish Balay #undef __FUNCT__
17814a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1782dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
17834e2b4712SSatish Balay {
17844e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
17854e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17866849ba73SBarry Smith   PetscErrorCode    ierr;
17875d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
17885d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1789d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1790d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1791d9fead3dSBarry Smith   const PetscScalar *b;
17924e2b4712SSatish Balay 
17934e2b4712SSatish Balay   PetscFunctionBegin;
1794d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17951ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1796f1af5d2fSBarry Smith   t  = a->solve_work;
17974e2b4712SSatish Balay 
17984e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17994e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
18004e2b4712SSatish Balay 
18014e2b4712SSatish Balay   /* forward solve the lower triangular */
18024e2b4712SSatish Balay   idx    = 4*(*r++);
1803f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1804f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
18054e2b4712SSatish Balay   for (i=1; i<n; i++) {
18064e2b4712SSatish Balay     v     = aa + 16*ai[i];
18074e2b4712SSatish Balay     vi    = aj + ai[i];
18084e2b4712SSatish Balay     nz    = diag[i] - ai[i];
18094e2b4712SSatish Balay     idx   = 4*(*r++);
1810f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
18114e2b4712SSatish Balay     while (nz--) {
18124e2b4712SSatish Balay       idx   = 4*(*vi++);
1813f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1814f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1815f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1816f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1817f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
18184e2b4712SSatish Balay       v    += 16;
18194e2b4712SSatish Balay     }
18204e2b4712SSatish Balay     idx        = 4*i;
1821f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1822f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
18234e2b4712SSatish Balay   }
18244e2b4712SSatish Balay   /* backward solve the upper triangular */
18254e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
18264e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
18274e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
18284e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
18294e2b4712SSatish Balay     idt  = 4*i;
1830f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1831f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
18324e2b4712SSatish Balay     while (nz--) {
18334e2b4712SSatish Balay       idx   = 4*(*vi++);
1834f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1835f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1836f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1837f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1838f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1839f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
18404e2b4712SSatish Balay       v += 16;
18414e2b4712SSatish Balay     }
18424e2b4712SSatish Balay     idc      = 4*(*c--);
18434e2b4712SSatish Balay     v        = aa + 16*diag[i];
1844f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1845f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1846f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1847f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
18484e2b4712SSatish Balay   }
18494e2b4712SSatish Balay 
18504e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18514e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1852d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1854dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
18554e2b4712SSatish Balay   PetscFunctionReturn(0);
18564e2b4712SSatish Balay }
1857f26ec98cSKris Buschelman 
1858f26ec98cSKris Buschelman #undef __FUNCT__
1859f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1860dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1861f26ec98cSKris Buschelman {
1862f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1863f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
18646849ba73SBarry Smith   PetscErrorCode    ierr;
18655d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
18665d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1867d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1868d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1869d9fead3dSBarry Smith   PetscScalar       *x;
1870d9fead3dSBarry Smith   const PetscScalar *b;
1871f26ec98cSKris Buschelman 
1872f26ec98cSKris Buschelman   PetscFunctionBegin;
1873d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1875f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1876f26ec98cSKris Buschelman 
1877f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1878f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1879f26ec98cSKris Buschelman 
1880f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1881f26ec98cSKris Buschelman   idx    = 4*(*r++);
1882f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1883f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1884f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1885f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1886f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1887f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1888f26ec98cSKris Buschelman     vi    = aj + ai[i];
1889f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1890f26ec98cSKris Buschelman     idx   = 4*(*r++);
1891f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1892f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1893f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1894f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1895f26ec98cSKris Buschelman     while (nz--) {
1896f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1897f26ec98cSKris Buschelman       x1  = t[idx];
1898f26ec98cSKris Buschelman       x2  = t[1+idx];
1899f26ec98cSKris Buschelman       x3  = t[2+idx];
1900f26ec98cSKris Buschelman       x4  = t[3+idx];
1901f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1902f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1903f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1904f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1905f26ec98cSKris Buschelman       v    += 16;
1906f26ec98cSKris Buschelman     }
1907f26ec98cSKris Buschelman     idx        = 4*i;
1908f26ec98cSKris Buschelman     t[idx]   = s1;
1909f26ec98cSKris Buschelman     t[1+idx] = s2;
1910f26ec98cSKris Buschelman     t[2+idx] = s3;
1911f26ec98cSKris Buschelman     t[3+idx] = s4;
1912f26ec98cSKris Buschelman   }
1913f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1914f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1915f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1916f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1917f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1918f26ec98cSKris Buschelman     idt  = 4*i;
1919f26ec98cSKris Buschelman     s1 = t[idt];
1920f26ec98cSKris Buschelman     s2 = t[1+idt];
1921f26ec98cSKris Buschelman     s3 = t[2+idt];
1922f26ec98cSKris Buschelman     s4 = t[3+idt];
1923f26ec98cSKris Buschelman     while (nz--) {
1924f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1925f26ec98cSKris Buschelman       x1  = t[idx];
1926f26ec98cSKris Buschelman       x2  = t[1+idx];
1927f26ec98cSKris Buschelman       x3  = t[2+idx];
1928f26ec98cSKris Buschelman       x4  = t[3+idx];
1929f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1930f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1931f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1932f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1933f26ec98cSKris Buschelman       v += 16;
1934f26ec98cSKris Buschelman     }
1935f26ec98cSKris Buschelman     idc      = 4*(*c--);
1936f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1937f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1938f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1939f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1940f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1941f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1942f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1943f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1944f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1945f26ec98cSKris Buschelman  }
1946f26ec98cSKris Buschelman 
1947f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1952f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1953f26ec98cSKris Buschelman }
1954f26ec98cSKris Buschelman 
195524c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
195624c233c2SKris Buschelman 
195724c233c2SKris Buschelman #include PETSC_HAVE_SSE
195824c233c2SKris Buschelman 
195924c233c2SKris Buschelman #undef __FUNCT__
196024c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
196224c233c2SKris Buschelman {
196324c233c2SKris Buschelman   /*
196424c233c2SKris Buschelman      Note: This code uses demotion of double
196524c233c2SKris Buschelman      to float when performing the mixed-mode computation.
196624c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
196724c233c2SKris Buschelman   */
196824c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
196924c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
19706849ba73SBarry Smith   PetscErrorCode ierr;
19715d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
19725d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
197324c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
197487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
197524c233c2SKris Buschelman 
197624c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
197724c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
197824c233c2SKris Buschelman   unsigned long   offset;
197924c233c2SKris Buschelman 
198024c233c2SKris Buschelman   PetscFunctionBegin;
198124c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
198224c233c2SKris Buschelman 
198324c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
198424c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
198524c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
198624c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
198724c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
198824c233c2SKris Buschelman 
19891ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
19901ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
199124c233c2SKris Buschelman     t  = a->solve_work;
199224c233c2SKris Buschelman 
199324c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
199424c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
199524c233c2SKris Buschelman 
199624c233c2SKris Buschelman     /* forward solve the lower triangular */
199724c233c2SKris Buschelman     idx  = 4*(*r++);
199824c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
199924c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
200024c233c2SKris Buschelman     v    =  aa + 16*ai[1];
200124c233c2SKris Buschelman 
200224c233c2SKris Buschelman     for (i=1; i<n;) {
200324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
200424c233c2SKris Buschelman       vi   =  aj      + ai[i];
200524c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
200624c233c2SKris Buschelman       idx  =  4*(*r++);
200724c233c2SKris Buschelman 
200824c233c2SKris Buschelman       /* Demote sum from double to float */
200924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
201024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
201124c233c2SKris Buschelman 
201224c233c2SKris Buschelman       while (nz--) {
201324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
201424c233c2SKris Buschelman         idx = 4*(*vi++);
201524c233c2SKris Buschelman 
201624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
201724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
201824c233c2SKris Buschelman 
201924c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
202024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
202124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
202224c233c2SKris Buschelman 
202324c233c2SKris Buschelman           /* First Column */
202424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
202524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
202624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
202724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
202824c233c2SKris Buschelman 
202924c233c2SKris Buschelman           /* Second Column */
203024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
203124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
203224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
203324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
203424c233c2SKris Buschelman 
203524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
203624c233c2SKris Buschelman 
203724c233c2SKris Buschelman           /* Third Column */
203824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
203924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
204024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
204124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
204224c233c2SKris Buschelman 
204324c233c2SKris Buschelman           /* Fourth Column */
204424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
204524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
204624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
204724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
204824c233c2SKris Buschelman         SSE_INLINE_END_2
204924c233c2SKris Buschelman 
205024c233c2SKris Buschelman         v  += 16;
205124c233c2SKris Buschelman       }
205224c233c2SKris Buschelman       idx = 4*i;
205324c233c2SKris Buschelman       v   = aa + 16*ai[++i];
205424c233c2SKris Buschelman       PREFETCH_NTA(v);
205524c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
205624c233c2SKris Buschelman 
205724c233c2SKris Buschelman       /* Promote result from float to double */
205824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
205924c233c2SKris Buschelman     }
206024c233c2SKris Buschelman     /* backward solve the upper triangular */
206124c233c2SKris Buschelman     idt  = 4*(n-1);
206224c233c2SKris Buschelman     ai16 = 16*diag[n-1];
206324c233c2SKris Buschelman     v    = aa + ai16 + 16;
206424c233c2SKris Buschelman     for (i=n-1; i>=0;){
206524c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
206624c233c2SKris Buschelman       vi = aj + diag[i] + 1;
206724c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
206824c233c2SKris Buschelman 
206924c233c2SKris Buschelman       /* Demote accumulator from double to float */
207024c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
207124c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
207224c233c2SKris Buschelman 
207324c233c2SKris Buschelman       while (nz--) {
207424c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
207524c233c2SKris Buschelman         idx = 4*(*vi++);
207624c233c2SKris Buschelman 
207724c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
207824c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
207924c233c2SKris Buschelman 
208024c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
208124c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
208224c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
208324c233c2SKris Buschelman 
208424c233c2SKris Buschelman           /* First Column */
208524c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
208624c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
208724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
208824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
208924c233c2SKris Buschelman 
209024c233c2SKris Buschelman           /* Second Column */
209124c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
209224c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
209324c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
209424c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
209524c233c2SKris Buschelman 
209624c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
209724c233c2SKris Buschelman 
209824c233c2SKris Buschelman           /* Third Column */
209924c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
210024c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
210124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
210224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
210324c233c2SKris Buschelman 
210424c233c2SKris Buschelman           /* Fourth Column */
210524c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
210624c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
210724c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
210824c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
210924c233c2SKris Buschelman         SSE_INLINE_END_2
211024c233c2SKris Buschelman         v  += 16;
211124c233c2SKris Buschelman       }
211224c233c2SKris Buschelman       v    = aa + ai16;
211324c233c2SKris Buschelman       ai16 = 16*diag[--i];
211424c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
211524c233c2SKris Buschelman       /*
211624c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
211724c233c2SKris Buschelman          which was inverted as part of the factorization
211824c233c2SKris Buschelman       */
211924c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
212024c233c2SKris Buschelman         /* First Column */
212124c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
212224c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
212324c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
212424c233c2SKris Buschelman 
212524c233c2SKris Buschelman         /* Second Column */
212624c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
212724c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
212824c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
212924c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
213024c233c2SKris Buschelman 
213124c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
213224c233c2SKris Buschelman 
213324c233c2SKris Buschelman         /* Third Column */
213424c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
213524c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
213624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
213724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
213824c233c2SKris Buschelman 
213924c233c2SKris Buschelman         /* Fourth Column */
214024c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
214124c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
214224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
214324c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
214424c233c2SKris Buschelman 
214524c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
214624c233c2SKris Buschelman       SSE_INLINE_END_3
214724c233c2SKris Buschelman 
214824c233c2SKris Buschelman       /* Promote solution from float to double */
214924c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
215024c233c2SKris Buschelman 
215124c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
215224c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
215324c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
215424c233c2SKris Buschelman       idc  = 4*(*c--);
215524c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
215624c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
215724c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
215824c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
215924c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
216024c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
216124c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
216224c233c2SKris Buschelman       SSE_INLINE_END_2
216324c233c2SKris Buschelman       v    = aa + ai16 + 16;
216424c233c2SKris Buschelman       idt -= 4;
216524c233c2SKris Buschelman     }
216624c233c2SKris Buschelman 
216724c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
216824c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21691ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
21701ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2171dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
217224c233c2SKris Buschelman   SSE_SCOPE_END;
217324c233c2SKris Buschelman   PetscFunctionReturn(0);
217424c233c2SKris Buschelman }
217524c233c2SKris Buschelman 
217624c233c2SKris Buschelman #endif
21770ef38995SBarry Smith 
21780ef38995SBarry Smith 
21794e2b4712SSatish Balay /*
21804e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
21814e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
21824e2b4712SSatish Balay */
21834a2ae208SSatish Balay #undef __FUNCT__
21844a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2185dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
21864e2b4712SSatish Balay {
21874e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2188356650c2SBarry Smith   PetscInt          n=a->mbs;
2189356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2190dfbe8321SBarry Smith   PetscErrorCode    ierr;
2191356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2192d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2193d9fead3dSBarry Smith   PetscScalar       *x;
2194d9fead3dSBarry Smith   const PetscScalar *b;
21954e2b4712SSatish Balay 
21964e2b4712SSatish Balay   PetscFunctionBegin;
2197d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21981ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21994e2b4712SSatish Balay 
2200aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
22012853dc0eSBarry Smith   {
220287828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22032853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
22042853dc0eSBarry Smith   }
2205aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
22062853dc0eSBarry Smith   {
220787828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22082853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
22092853dc0eSBarry Smith   }
2210aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
22112853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2212e1293385SBarry Smith #else
221330d4dcafSBarry Smith   {
221487828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2215d9fead3dSBarry Smith     const MatScalar *v;
2216356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2217356650c2SBarry Smith     const PetscInt  *vi;
2218e1293385SBarry Smith 
22194e2b4712SSatish Balay   /* forward solve the lower triangular */
22204e2b4712SSatish Balay   idx    = 0;
2221e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
22224e2b4712SSatish Balay   for (i=1; i<n; i++) {
22234e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
22244e2b4712SSatish Balay     vi    =  aj      + ai[i];
22254e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2226e1293385SBarry Smith     idx   +=  4;
2227f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22284e2b4712SSatish Balay     while (nz--) {
22294e2b4712SSatish Balay       jdx   = 4*(*vi++);
22304e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2231f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2232f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2233f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2234f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
22354e2b4712SSatish Balay       v    += 16;
22364e2b4712SSatish Balay     }
2237f1af5d2fSBarry Smith     x[idx]   = s1;
2238f1af5d2fSBarry Smith     x[1+idx] = s2;
2239f1af5d2fSBarry Smith     x[2+idx] = s3;
2240f1af5d2fSBarry Smith     x[3+idx] = s4;
22414e2b4712SSatish Balay   }
22424e2b4712SSatish Balay   /* backward solve the upper triangular */
22434e555682SBarry Smith   idt = 4*(n-1);
22444e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22454e555682SBarry Smith     ai16 = 16*diag[i];
22464e555682SBarry Smith     v    = aa + ai16 + 16;
22474e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
22484e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2249f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2250f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
22514e2b4712SSatish Balay     while (nz--) {
22524e2b4712SSatish Balay       idx   = 4*(*vi++);
22534e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2254f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2255f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2256f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2257f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
22584e2b4712SSatish Balay       v    += 16;
22594e2b4712SSatish Balay     }
22604e555682SBarry Smith     v        = aa + ai16;
2261f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2262f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2263f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2264f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2265329f5518SBarry Smith     idt -= 4;
22664e2b4712SSatish Balay   }
226730d4dcafSBarry Smith   }
2268e1293385SBarry Smith #endif
22694e2b4712SSatish Balay 
2270d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2272dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
22734e2b4712SSatish Balay   PetscFunctionReturn(0);
22744e2b4712SSatish Balay }
22754e2b4712SSatish Balay 
2276f26ec98cSKris Buschelman #undef __FUNCT__
2277f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2278dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2279f26ec98cSKris Buschelman {
2280f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2281690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2282dfbe8321SBarry Smith   PetscErrorCode ierr;
2283690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2284f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2285f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2286f26ec98cSKris Buschelman 
2287f26ec98cSKris Buschelman   PetscFunctionBegin;
22881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2290f26ec98cSKris Buschelman 
2291f26ec98cSKris Buschelman   {
2292f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2293f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2294690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2295f26ec98cSKris Buschelman 
2296f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2297f26ec98cSKris Buschelman     idx  = 0;
2298f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2299f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2300f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2301f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2302f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2303f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2304f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2305f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2306f26ec98cSKris Buschelman       idx   +=  4;
2307f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2308f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2309f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2310f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2311f26ec98cSKris Buschelman       while (nz--) {
2312f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2313f26ec98cSKris Buschelman         x1  = t[jdx];
2314f26ec98cSKris Buschelman         x2  = t[1+jdx];
2315f26ec98cSKris Buschelman         x3  = t[2+jdx];
2316f26ec98cSKris Buschelman         x4  = t[3+jdx];
2317f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2318f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2319f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2320f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2321f26ec98cSKris Buschelman         v    += 16;
2322f26ec98cSKris Buschelman       }
2323f26ec98cSKris Buschelman       t[idx]   = s1;
2324f26ec98cSKris Buschelman       t[1+idx] = s2;
2325f26ec98cSKris Buschelman       t[2+idx] = s3;
2326f26ec98cSKris Buschelman       t[3+idx] = s4;
2327f26ec98cSKris Buschelman     }
2328f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2329f26ec98cSKris Buschelman     idt = 4*(n-1);
2330f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2331f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2332f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2333f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2334f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2335f26ec98cSKris Buschelman       s1   = t[idt];
2336f26ec98cSKris Buschelman       s2   = t[1+idt];
2337f26ec98cSKris Buschelman       s3   = t[2+idt];
2338f26ec98cSKris Buschelman       s4   = t[3+idt];
2339f26ec98cSKris Buschelman       while (nz--) {
2340f26ec98cSKris Buschelman         idx = 4*(*vi++);
2341f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2342f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2343f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2344f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2345f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2346f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2347f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2348f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2349f26ec98cSKris Buschelman         v    += 16;
2350f26ec98cSKris Buschelman       }
2351f26ec98cSKris Buschelman       v        = aa + ai16;
2352f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2353f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2354f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2355f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2356f26ec98cSKris Buschelman       idt -= 4;
2357f26ec98cSKris Buschelman     }
2358f26ec98cSKris Buschelman   }
2359f26ec98cSKris Buschelman 
23601ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
23611ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2362dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2363f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2364f26ec98cSKris Buschelman }
2365f26ec98cSKris Buschelman 
23663660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
23673660e330SKris Buschelman 
23683660e330SKris Buschelman #include PETSC_HAVE_SSE
23693660e330SKris Buschelman #undef __FUNCT__
23707cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2371dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
23723660e330SKris Buschelman {
23733660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
23742aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2375dfbe8321SBarry Smith   PetscErrorCode ierr;
2376dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
23773660e330SKris Buschelman   MatScalar      *aa=a->a;
237887828ca2SBarry Smith   PetscScalar    *x,*b;
23793660e330SKris Buschelman 
23803660e330SKris Buschelman   PetscFunctionBegin;
23813660e330SKris Buschelman   SSE_SCOPE_BEGIN;
23823660e330SKris Buschelman   /*
23833660e330SKris Buschelman      Note: This code currently uses demotion of double
23843660e330SKris Buschelman      to float when performing the mixed-mode computation.
23853660e330SKris Buschelman      This may not be numerically reasonable for all applications.
23863660e330SKris Buschelman   */
23873660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
23883660e330SKris Buschelman 
23891ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23913660e330SKris Buschelman   {
2392eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2393eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
23942aa5897fSKris Buschelman     int            nz,i,idt,ai16;
23952aa5897fSKris Buschelman     unsigned int   jdx,idx;
23962aa5897fSKris Buschelman     unsigned short *vi;
2397eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
23983660e330SKris Buschelman 
2399eb05f457SKris Buschelman     /* First block is the identity. */
24003660e330SKris Buschelman     idx  = 0;
2401eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
24022aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
24033660e330SKris Buschelman 
24043660e330SKris Buschelman     for (i=1; i<n;) {
24053660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
24063660e330SKris Buschelman       vi   =  aj      + ai[i];
24073660e330SKris Buschelman       nz   =  diag[i] - ai[i];
24083660e330SKris Buschelman       idx +=  4;
24093660e330SKris Buschelman 
2410eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2411eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2412eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
24133660e330SKris Buschelman 
24143660e330SKris Buschelman       while (nz--) {
24153660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
24162aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
24173660e330SKris Buschelman 
24183660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2419eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
24203660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24213660e330SKris Buschelman 
24223660e330SKris Buschelman           /* First Column */
24233660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24243660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24253660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24263660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24273660e330SKris Buschelman 
24283660e330SKris Buschelman           /* Second Column */
24293660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24303660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24313660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24323660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24333660e330SKris Buschelman 
24343660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24353660e330SKris Buschelman 
24363660e330SKris Buschelman           /* Third Column */
24373660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24383660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24393660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24403660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24413660e330SKris Buschelman 
24423660e330SKris Buschelman           /* Fourth Column */
24433660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24443660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24453660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24463660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
24473660e330SKris Buschelman         SSE_INLINE_END_2
24483660e330SKris Buschelman 
24493660e330SKris Buschelman         v  += 16;
24503660e330SKris Buschelman       }
24513660e330SKris Buschelman       v    =  aa + 16*ai[++i];
24523660e330SKris Buschelman       PREFETCH_NTA(v);
2453eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
24543660e330SKris Buschelman     }
2455eb05f457SKris Buschelman 
2456eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2457eb05f457SKris Buschelman 
24583660e330SKris Buschelman     idt  = 4*(n-1);
24593660e330SKris Buschelman     ai16 = 16*diag[n-1];
24603660e330SKris Buschelman     v    = aa + ai16 + 16;
24613660e330SKris Buschelman     for (i=n-1; i>=0;){
24623660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
24633660e330SKris Buschelman       vi = aj + diag[i] + 1;
24643660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
24653660e330SKris Buschelman 
2466eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
24673660e330SKris Buschelman 
24683660e330SKris Buschelman       while (nz--) {
24693660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
24702aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
24713660e330SKris Buschelman 
24723660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2473eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
24743660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24753660e330SKris Buschelman 
24763660e330SKris Buschelman           /* First Column */
24773660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24783660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24793660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24803660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24813660e330SKris Buschelman 
24823660e330SKris Buschelman           /* Second Column */
24833660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24843660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24853660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24863660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24873660e330SKris Buschelman 
24883660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24893660e330SKris Buschelman 
24903660e330SKris Buschelman           /* Third Column */
24913660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24923660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24933660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24943660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24953660e330SKris Buschelman 
24963660e330SKris Buschelman           /* Fourth Column */
24973660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24983660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24993660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25003660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25013660e330SKris Buschelman         SSE_INLINE_END_2
25023660e330SKris Buschelman         v  += 16;
25033660e330SKris Buschelman       }
25043660e330SKris Buschelman       v    = aa + ai16;
25053660e330SKris Buschelman       ai16 = 16*diag[--i];
25063660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
25073660e330SKris Buschelman       /*
25083660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
25093660e330SKris Buschelman          which was inverted as part of the factorization
25103660e330SKris Buschelman       */
2511eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
25123660e330SKris Buschelman         /* First Column */
25133660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
25143660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
25153660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
25163660e330SKris Buschelman 
25173660e330SKris Buschelman         /* Second Column */
25183660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
25193660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
25203660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
25213660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
25223660e330SKris Buschelman 
25233660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
25243660e330SKris Buschelman 
25253660e330SKris Buschelman         /* Third Column */
25263660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
25273660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
25283660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
25293660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
25303660e330SKris Buschelman 
25313660e330SKris Buschelman         /* Fourth Column */
25323660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
25333660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
25343660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
25353660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
25363660e330SKris Buschelman 
25373660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
25383660e330SKris Buschelman       SSE_INLINE_END_3
25393660e330SKris Buschelman 
25403660e330SKris Buschelman       v    = aa + ai16 + 16;
25413660e330SKris Buschelman       idt -= 4;
25423660e330SKris Buschelman     }
2543eb05f457SKris Buschelman 
2544eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2545eb05f457SKris Buschelman     idt = 4*(n-1);
2546eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2547eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2548eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2549eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2550eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2551eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2552eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2553eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2554eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
255554693613SKris Buschelman       idt -= 4;
25563660e330SKris Buschelman     }
2557eb05f457SKris Buschelman 
2558eb05f457SKris Buschelman   } /* End of artificial scope. */
25591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
25601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2561dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
25623660e330SKris Buschelman   SSE_SCOPE_END;
25633660e330SKris Buschelman   PetscFunctionReturn(0);
25643660e330SKris Buschelman }
25653660e330SKris Buschelman 
25667cf1b8d3SKris Buschelman #undef __FUNCT__
25677cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2568dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
25697cf1b8d3SKris Buschelman {
25707cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
25717cf1b8d3SKris Buschelman   int            *aj=a->j;
2572dfbe8321SBarry Smith   PetscErrorCode ierr;
2573dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
25747cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
25757cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
25767cf1b8d3SKris Buschelman 
25777cf1b8d3SKris Buschelman   PetscFunctionBegin;
25787cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
25797cf1b8d3SKris Buschelman   /*
25807cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
25817cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
25827cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
25837cf1b8d3SKris Buschelman   */
25847cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
25857cf1b8d3SKris Buschelman 
25861ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25871ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25887cf1b8d3SKris Buschelman   {
25897cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
25907cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
25917cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
25927cf1b8d3SKris Buschelman     int       jdx,idx;
25937cf1b8d3SKris Buschelman     int       *vi;
25947cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
25957cf1b8d3SKris Buschelman 
25967cf1b8d3SKris Buschelman     /* First block is the identity. */
25977cf1b8d3SKris Buschelman     idx  = 0;
25987cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
25997cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
26007cf1b8d3SKris Buschelman 
26017cf1b8d3SKris Buschelman     for (i=1; i<n;) {
26027cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
26037cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
26047cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
26057cf1b8d3SKris Buschelman       idx +=  4;
26067cf1b8d3SKris Buschelman 
26077cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
26087cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
26097cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
26107cf1b8d3SKris Buschelman 
26117cf1b8d3SKris Buschelman       while (nz--) {
26127cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
26137cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
26147cf1b8d3SKris Buschelman /*          jdx = *vi++; */
26157cf1b8d3SKris Buschelman 
26167cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
26177cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
26187cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26197cf1b8d3SKris Buschelman 
26207cf1b8d3SKris Buschelman           /* First Column */
26217cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26227cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26237cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26247cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26257cf1b8d3SKris Buschelman 
26267cf1b8d3SKris Buschelman           /* Second Column */
26277cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26287cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26297cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26307cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26317cf1b8d3SKris Buschelman 
26327cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26337cf1b8d3SKris Buschelman 
26347cf1b8d3SKris Buschelman           /* Third Column */
26357cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26367cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26377cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26387cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26397cf1b8d3SKris Buschelman 
26407cf1b8d3SKris Buschelman           /* Fourth Column */
26417cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26427cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26437cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26447cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26457cf1b8d3SKris Buschelman         SSE_INLINE_END_2
26467cf1b8d3SKris Buschelman 
26477cf1b8d3SKris Buschelman         v  += 16;
26487cf1b8d3SKris Buschelman       }
26497cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
26507cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
26517cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
26527cf1b8d3SKris Buschelman     }
26537cf1b8d3SKris Buschelman 
26547cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
26557cf1b8d3SKris Buschelman 
26567cf1b8d3SKris Buschelman     idt  = 4*(n-1);
26577cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
26587cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
26597cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
26607cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
26617cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
26627cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
26637cf1b8d3SKris Buschelman 
26647cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
26657cf1b8d3SKris Buschelman 
26667cf1b8d3SKris Buschelman       while (nz--) {
26677cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
26687cf1b8d3SKris Buschelman         idx = 4*(*vi++);
26697cf1b8d3SKris Buschelman /*          idx = *vi++; */
26707cf1b8d3SKris Buschelman 
26717cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
26727cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
26737cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26747cf1b8d3SKris Buschelman 
26757cf1b8d3SKris Buschelman           /* First Column */
26767cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26777cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26787cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26797cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26807cf1b8d3SKris Buschelman 
26817cf1b8d3SKris Buschelman           /* Second Column */
26827cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26837cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26847cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26857cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26867cf1b8d3SKris Buschelman 
26877cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26887cf1b8d3SKris Buschelman 
26897cf1b8d3SKris Buschelman           /* Third Column */
26907cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26917cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26927cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26937cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26947cf1b8d3SKris Buschelman 
26957cf1b8d3SKris Buschelman           /* Fourth Column */
26967cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26977cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26987cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26997cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
27007cf1b8d3SKris Buschelman         SSE_INLINE_END_2
27017cf1b8d3SKris Buschelman         v  += 16;
27027cf1b8d3SKris Buschelman       }
27037cf1b8d3SKris Buschelman       v    = aa + ai16;
27047cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
27057cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
27067cf1b8d3SKris Buschelman       /*
27077cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
27087cf1b8d3SKris Buschelman          which was inverted as part of the factorization
27097cf1b8d3SKris Buschelman       */
27107cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
27117cf1b8d3SKris Buschelman         /* First Column */
27127cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
27137cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
27147cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
27157cf1b8d3SKris Buschelman 
27167cf1b8d3SKris Buschelman         /* Second Column */
27177cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
27187cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
27197cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
27207cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
27217cf1b8d3SKris Buschelman 
27227cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
27237cf1b8d3SKris Buschelman 
27247cf1b8d3SKris Buschelman         /* Third Column */
27257cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
27267cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
27277cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
27287cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
27297cf1b8d3SKris Buschelman 
27307cf1b8d3SKris Buschelman         /* Fourth Column */
27317cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
27327cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
27337cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
27347cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
27357cf1b8d3SKris Buschelman 
27367cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
27377cf1b8d3SKris Buschelman       SSE_INLINE_END_3
27387cf1b8d3SKris Buschelman 
27397cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
27407cf1b8d3SKris Buschelman       idt -= 4;
27417cf1b8d3SKris Buschelman     }
27427cf1b8d3SKris Buschelman 
27437cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
27447cf1b8d3SKris Buschelman     idt = 4*(n-1);
27457cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
27467cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
27477cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
27487cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
27497cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
27507cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
27517cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
27527cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
27537cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
27547cf1b8d3SKris Buschelman       idt -= 4;
27557cf1b8d3SKris Buschelman     }
27567cf1b8d3SKris Buschelman 
27577cf1b8d3SKris Buschelman   } /* End of artificial scope. */
27581ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
27591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2760dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
27617cf1b8d3SKris Buschelman   SSE_SCOPE_END;
27627cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
27637cf1b8d3SKris Buschelman }
27647cf1b8d3SKris Buschelman 
27653660e330SKris Buschelman #endif
27664a2ae208SSatish Balay #undef __FUNCT__
27674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2768dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
27694e2b4712SSatish Balay {
27704e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
27714e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
27726849ba73SBarry Smith   PetscErrorCode    ierr;
27735d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
27745d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2775d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2776d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2777d9fead3dSBarry Smith   const PetscScalar *b;
27784e2b4712SSatish Balay 
27794e2b4712SSatish Balay   PetscFunctionBegin;
2780d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2782f1af5d2fSBarry Smith   t  = a->solve_work;
27834e2b4712SSatish Balay 
27844e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27854e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27864e2b4712SSatish Balay 
27874e2b4712SSatish Balay   /* forward solve the lower triangular */
27884e2b4712SSatish Balay   idx    = 3*(*r++);
2789f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
27904e2b4712SSatish Balay   for (i=1; i<n; i++) {
27914e2b4712SSatish Balay     v     = aa + 9*ai[i];
27924e2b4712SSatish Balay     vi    = aj + ai[i];
27934e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27944e2b4712SSatish Balay     idx   = 3*(*r++);
2795f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
27964e2b4712SSatish Balay     while (nz--) {
27974e2b4712SSatish Balay       idx   = 3*(*vi++);
2798f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2799f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2800f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2801f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28024e2b4712SSatish Balay       v += 9;
28034e2b4712SSatish Balay     }
28044e2b4712SSatish Balay     idx = 3*i;
2805f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
28064e2b4712SSatish Balay   }
28074e2b4712SSatish Balay   /* backward solve the upper triangular */
28084e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28094e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
28104e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28114e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28124e2b4712SSatish Balay     idt  = 3*i;
2813f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
28144e2b4712SSatish Balay     while (nz--) {
28154e2b4712SSatish Balay       idx   = 3*(*vi++);
2816f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2817f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2818f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2819f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28204e2b4712SSatish Balay       v += 9;
28214e2b4712SSatish Balay     }
28224e2b4712SSatish Balay     idc = 3*(*c--);
28234e2b4712SSatish Balay     v   = aa + 9*diag[i];
2824f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2825f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2826f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
28274e2b4712SSatish Balay   }
28284e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28294e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2830d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28311ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2832dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
28334e2b4712SSatish Balay   PetscFunctionReturn(0);
28344e2b4712SSatish Balay }
28354e2b4712SSatish Balay 
283615091d37SBarry Smith /*
283715091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
283815091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
283915091d37SBarry Smith */
28404a2ae208SSatish Balay #undef __FUNCT__
28414a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2842dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
284315091d37SBarry Smith {
284415091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2845690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2846dfbe8321SBarry Smith   PetscErrorCode    ierr;
2847690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2848d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2849d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2850d9fead3dSBarry Smith   const PetscScalar *b;
2851690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
285215091d37SBarry Smith 
285315091d37SBarry Smith   PetscFunctionBegin;
2854d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28551ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
285615091d37SBarry Smith 
285715091d37SBarry Smith   /* forward solve the lower triangular */
285815091d37SBarry Smith   idx    = 0;
285915091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
286015091d37SBarry Smith   for (i=1; i<n; i++) {
286115091d37SBarry Smith     v     =  aa      + 9*ai[i];
286215091d37SBarry Smith     vi    =  aj      + ai[i];
286315091d37SBarry Smith     nz    =  diag[i] - ai[i];
286415091d37SBarry Smith     idx   +=  3;
2865f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
286615091d37SBarry Smith     while (nz--) {
286715091d37SBarry Smith       jdx   = 3*(*vi++);
286815091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2869f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2870f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2871f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
287215091d37SBarry Smith       v    += 9;
287315091d37SBarry Smith     }
2874f1af5d2fSBarry Smith     x[idx]   = s1;
2875f1af5d2fSBarry Smith     x[1+idx] = s2;
2876f1af5d2fSBarry Smith     x[2+idx] = s3;
287715091d37SBarry Smith   }
287815091d37SBarry Smith   /* backward solve the upper triangular */
287915091d37SBarry Smith   for (i=n-1; i>=0; i--){
288015091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
288115091d37SBarry Smith     vi   = aj + diag[i] + 1;
288215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
288315091d37SBarry Smith     idt  = 3*i;
2884f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2885f1af5d2fSBarry Smith     s3 = x[2+idt];
288615091d37SBarry Smith     while (nz--) {
288715091d37SBarry Smith       idx   = 3*(*vi++);
288815091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2889f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2890f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2891f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
289215091d37SBarry Smith       v    += 9;
289315091d37SBarry Smith     }
289415091d37SBarry Smith     v        = aa +  9*diag[i];
2895f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2896f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2897f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
289815091d37SBarry Smith   }
289915091d37SBarry Smith 
2900d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2902dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
290315091d37SBarry Smith   PetscFunctionReturn(0);
290415091d37SBarry Smith }
290515091d37SBarry Smith 
29064a2ae208SSatish Balay #undef __FUNCT__
29074a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2908dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
29094e2b4712SSatish Balay {
29104e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
29114e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
29126849ba73SBarry Smith   PetscErrorCode    ierr;
29135d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
29145d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2915d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2916d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
2917d9fead3dSBarry Smith   const PetscScalar *b;
29184e2b4712SSatish Balay 
29194e2b4712SSatish Balay   PetscFunctionBegin;
2920d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2922f1af5d2fSBarry Smith   t  = a->solve_work;
29234e2b4712SSatish Balay 
29244e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29254e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29264e2b4712SSatish Balay 
29274e2b4712SSatish Balay   /* forward solve the lower triangular */
29284e2b4712SSatish Balay   idx    = 2*(*r++);
2929f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
29304e2b4712SSatish Balay   for (i=1; i<n; i++) {
29314e2b4712SSatish Balay     v     = aa + 4*ai[i];
29324e2b4712SSatish Balay     vi    = aj + ai[i];
29334e2b4712SSatish Balay     nz    = diag[i] - ai[i];
29344e2b4712SSatish Balay     idx   = 2*(*r++);
2935f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
29364e2b4712SSatish Balay     while (nz--) {
29374e2b4712SSatish Balay       idx   = 2*(*vi++);
2938f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2939f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2940f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
29414e2b4712SSatish Balay       v += 4;
29424e2b4712SSatish Balay     }
29434e2b4712SSatish Balay     idx = 2*i;
2944f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
29454e2b4712SSatish Balay   }
29464e2b4712SSatish Balay   /* backward solve the upper triangular */
29474e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29484e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
29494e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29504e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
29514e2b4712SSatish Balay     idt  = 2*i;
2952f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
29534e2b4712SSatish Balay     while (nz--) {
29544e2b4712SSatish Balay       idx   = 2*(*vi++);
2955f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2956f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2957f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
29584e2b4712SSatish Balay       v += 4;
29594e2b4712SSatish Balay     }
29604e2b4712SSatish Balay     idc = 2*(*c--);
29614e2b4712SSatish Balay     v   = aa + 4*diag[i];
2962f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2963f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
29644e2b4712SSatish Balay   }
29654e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29664e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2967d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2969dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
29704e2b4712SSatish Balay   PetscFunctionReturn(0);
29714e2b4712SSatish Balay }
29724e2b4712SSatish Balay 
297315091d37SBarry Smith /*
297415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
297515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
297615091d37SBarry Smith */
29774a2ae208SSatish Balay #undef __FUNCT__
29784a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2979dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
298015091d37SBarry Smith {
298115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2982690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2983dfbe8321SBarry Smith   PetscErrorCode    ierr;
2984690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2985d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2986d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
2987d9fead3dSBarry Smith   const PetscScalar *b;
2988690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
298915091d37SBarry Smith 
299015091d37SBarry Smith   PetscFunctionBegin;
2991d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29921ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
299315091d37SBarry Smith 
299415091d37SBarry Smith   /* forward solve the lower triangular */
299515091d37SBarry Smith   idx    = 0;
299615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
299715091d37SBarry Smith   for (i=1; i<n; i++) {
299815091d37SBarry Smith     v     =  aa      + 4*ai[i];
299915091d37SBarry Smith     vi    =  aj      + ai[i];
300015091d37SBarry Smith     nz    =  diag[i] - ai[i];
300115091d37SBarry Smith     idx   +=  2;
3002f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
300315091d37SBarry Smith     while (nz--) {
300415091d37SBarry Smith       jdx   = 2*(*vi++);
300515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3006f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3007f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
300815091d37SBarry Smith       v    += 4;
300915091d37SBarry Smith     }
3010f1af5d2fSBarry Smith     x[idx]   = s1;
3011f1af5d2fSBarry Smith     x[1+idx] = s2;
301215091d37SBarry Smith   }
301315091d37SBarry Smith   /* backward solve the upper triangular */
301415091d37SBarry Smith   for (i=n-1; i>=0; i--){
301515091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
301615091d37SBarry Smith     vi   = aj + diag[i] + 1;
301715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
301815091d37SBarry Smith     idt  = 2*i;
3019f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
302015091d37SBarry Smith     while (nz--) {
302115091d37SBarry Smith       idx   = 2*(*vi++);
302215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3023f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3024f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
302515091d37SBarry Smith       v    += 4;
302615091d37SBarry Smith     }
302715091d37SBarry Smith     v        = aa +  4*diag[i];
3028f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3029f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
303015091d37SBarry Smith   }
303115091d37SBarry Smith 
3032d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3034dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
303515091d37SBarry Smith   PetscFunctionReturn(0);
303615091d37SBarry Smith }
303715091d37SBarry Smith 
30384a2ae208SSatish Balay #undef __FUNCT__
30394a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3040dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
30414e2b4712SSatish Balay {
30424e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
30434e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
30446849ba73SBarry Smith   PetscErrorCode ierr;
30455d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
30465d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
30473f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
304887828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
30494e2b4712SSatish Balay 
30504e2b4712SSatish Balay   PetscFunctionBegin;
30514e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
30524e2b4712SSatish Balay 
30531ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30541ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3055f1af5d2fSBarry Smith   t  = a->solve_work;
30564e2b4712SSatish Balay 
30574e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30584e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30594e2b4712SSatish Balay 
30604e2b4712SSatish Balay   /* forward solve the lower triangular */
3061f1af5d2fSBarry Smith   t[0] = b[*r++];
30624e2b4712SSatish Balay   for (i=1; i<n; i++) {
30634e2b4712SSatish Balay     v     = aa + ai[i];
30644e2b4712SSatish Balay     vi    = aj + ai[i];
30654e2b4712SSatish Balay     nz    = diag[i] - ai[i];
3066f1af5d2fSBarry Smith     s1  = b[*r++];
30674e2b4712SSatish Balay     while (nz--) {
3068f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30694e2b4712SSatish Balay     }
3070f1af5d2fSBarry Smith     t[i] = s1;
30714e2b4712SSatish Balay   }
30724e2b4712SSatish Balay   /* backward solve the upper triangular */
30734e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30744e2b4712SSatish Balay     v    = aa + diag[i] + 1;
30754e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30764e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3077f1af5d2fSBarry Smith     s1 = t[i];
30784e2b4712SSatish Balay     while (nz--) {
3079f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30804e2b4712SSatish Balay     }
3081f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
30824e2b4712SSatish Balay   }
30834e2b4712SSatish Balay 
30844e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30854e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30861ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3088dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
30894e2b4712SSatish Balay   PetscFunctionReturn(0);
30904e2b4712SSatish Balay }
309115091d37SBarry Smith /*
309215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
309315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
309415091d37SBarry Smith */
30954a2ae208SSatish Balay #undef __FUNCT__
30964a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3097dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
309815091d37SBarry Smith {
309915091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3100690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3101dfbe8321SBarry Smith   PetscErrorCode ierr;
3102690b6cddSBarry Smith   PetscInt       *diag = a->diag;
310315091d37SBarry Smith   MatScalar      *aa=a->a;
310487828ca2SBarry Smith   PetscScalar    *x,*b;
310587828ca2SBarry Smith   PetscScalar    s1,x1;
310615091d37SBarry Smith   MatScalar      *v;
3107690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
310815091d37SBarry Smith 
310915091d37SBarry Smith   PetscFunctionBegin;
31101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
31111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
311215091d37SBarry Smith 
311315091d37SBarry Smith   /* forward solve the lower triangular */
311415091d37SBarry Smith   idx    = 0;
311515091d37SBarry Smith   x[0]   = b[0];
311615091d37SBarry Smith   for (i=1; i<n; i++) {
311715091d37SBarry Smith     v     =  aa      + ai[i];
311815091d37SBarry Smith     vi    =  aj      + ai[i];
311915091d37SBarry Smith     nz    =  diag[i] - ai[i];
312015091d37SBarry Smith     idx   +=  1;
3121f1af5d2fSBarry Smith     s1  =  b[idx];
312215091d37SBarry Smith     while (nz--) {
312315091d37SBarry Smith       jdx   = *vi++;
312415091d37SBarry Smith       x1    = x[jdx];
3125f1af5d2fSBarry Smith       s1 -= v[0]*x1;
312615091d37SBarry Smith       v    += 1;
312715091d37SBarry Smith     }
3128f1af5d2fSBarry Smith     x[idx]   = s1;
312915091d37SBarry Smith   }
313015091d37SBarry Smith   /* backward solve the upper triangular */
313115091d37SBarry Smith   for (i=n-1; i>=0; i--){
313215091d37SBarry Smith     v    = aa + diag[i] + 1;
313315091d37SBarry Smith     vi   = aj + diag[i] + 1;
313415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
313515091d37SBarry Smith     idt  = i;
3136f1af5d2fSBarry Smith     s1 = x[idt];
313715091d37SBarry Smith     while (nz--) {
313815091d37SBarry Smith       idx   = *vi++;
313915091d37SBarry Smith       x1    = x[idx];
3140f1af5d2fSBarry Smith       s1 -= v[0]*x1;
314115091d37SBarry Smith       v    += 1;
314215091d37SBarry Smith     }
314315091d37SBarry Smith     v        = aa +  diag[i];
3144f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
314515091d37SBarry Smith   }
31461ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
31471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3148dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
314915091d37SBarry Smith   PetscFunctionReturn(0);
315015091d37SBarry Smith }
31514e2b4712SSatish Balay 
31524e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/* Forward declarations of SeqBAIJ routines whose definitions are not visible at this point of the file. */
31536bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
31546bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
31556bce7ff8SHong Zhang 
/* Installed as C->ops->solve by MatLUFactorNumeric_SeqBAIJ_N_newdatastruct() for block sizes other than 5. */
3156*84a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
31576bce7ff8SHong Zhang #undef __FUNCT__
31586bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
31596bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
31606bce7ff8SHong Zhang {
31616bce7ff8SHong Zhang   Mat            C=B;
31626bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
31636bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
31646bce7ff8SHong Zhang   PetscErrorCode ierr;
31656bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
31666bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
31676bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3168914a18a2SHong Zhang   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3169914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3170914a18a2SHong Zhang   MatScalar      *v_work;
31716bce7ff8SHong Zhang 
31726bce7ff8SHong Zhang   PetscFunctionBegin;
31736bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
31746bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3175914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3176914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
31776bce7ff8SHong Zhang   ics  = ic;
31786bce7ff8SHong Zhang 
3179914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
3180914a18a2SHong Zhang   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3181914a18a2SHong Zhang   multiplier = v_work + bs;
3182914a18a2SHong Zhang   v_pivots   = (PetscInt*)(multiplier + bs2);
3183914a18a2SHong Zhang 
31846bce7ff8SHong Zhang   for (i=0; i<n; i++){
31856bce7ff8SHong Zhang     /* zero rtmp */
31866bce7ff8SHong Zhang     /* L part */
31876bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
31886bce7ff8SHong Zhang     bjtmp = bj + bi[i];
3189914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3190914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3191914a18a2SHong Zhang     }
31926bce7ff8SHong Zhang 
31936bce7ff8SHong Zhang     /* U part */
31946bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
31956bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
3196914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3197914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3198914a18a2SHong Zhang     }
31996bce7ff8SHong Zhang 
32006bce7ff8SHong Zhang     /* load in initial (unfactored row) */
32016bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
32026bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
3203914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
32046bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3205914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
32066bce7ff8SHong Zhang     }
32076bce7ff8SHong Zhang 
32086bce7ff8SHong Zhang     /* elimination */
32096bce7ff8SHong Zhang     bjtmp = bj + bi[i];
32106bce7ff8SHong Zhang     row   = *bjtmp++;
32116bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
32126bce7ff8SHong Zhang     k   = 0;
32136bce7ff8SHong Zhang     while  (k < nzL) {
3214914a18a2SHong Zhang       pc = rtmp + bs2*row;
3215914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3216914a18a2SHong Zhang       if (flg) {
3217914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
3218914a18a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
32196bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3220914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
32216bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3222914a18a2SHong Zhang         for (j=0; j<nz; j++) {
3223914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3224914a18a2SHong Zhang         }
32256bce7ff8SHong Zhang         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
32266bce7ff8SHong Zhang       }
32276bce7ff8SHong Zhang       row = *bjtmp++; k++;
32286bce7ff8SHong Zhang     }
32296bce7ff8SHong Zhang 
32306bce7ff8SHong Zhang     /* finished row so stick it into b->a */
32316bce7ff8SHong Zhang     /* L part */
3232914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
32336bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
32346bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
32356bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3236914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
32376bce7ff8SHong Zhang     }
32386bce7ff8SHong Zhang 
32396bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
3240914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
32416bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
3242914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3243914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3244914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
32456bce7ff8SHong Zhang 
32466bce7ff8SHong Zhang     /* U part */
3247914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
32486bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
32496bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3250914a18a2SHong Zhang     for (j=0; j<nz; j++){
3251914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3252914a18a2SHong Zhang     }
32536bce7ff8SHong Zhang   }
32546bce7ff8SHong Zhang 
32556bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
32566bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
32576bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
32586bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3259*84a281e5SHong Zhang   if (bs == 5){
3260*84a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3261*84a281e5SHong Zhang   } else {
3262*84a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3263*84a281e5SHong Zhang   }
32646bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
3265914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
32666bce7ff8SHong Zhang   PetscFunctionReturn(0);
32676bce7ff8SHong Zhang }
32686bce7ff8SHong Zhang 
32696bce7ff8SHong Zhang /*
32706bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
32716bce7ff8SHong Zhang    Factored arrays bj and ba are stored as
32726bce7ff8SHong Zhang      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
32736bce7ff8SHong Zhang 
32746bce7ff8SHong Zhang    bi=fact->i is an array of size 2n+2, in which
32756bce7ff8SHong Zhang    bi+
32766bce7ff8SHong Zhang      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
32776bce7ff8SHong Zhang      bi[n]      ->  end of L(n-1,:)+1
32786bce7ff8SHong Zhang      bi[n+1]    ->  1st entry of U(n-1,:)
32796bce7ff8SHong Zhang      bi[2n-i]   ->  1st entry of U(i,:)
32806bce7ff8SHong Zhang      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
32816bce7ff8SHong Zhang      bi[2n]     ->  end of U(0,:)+1
32826bce7ff8SHong Zhang 
32836bce7ff8SHong Zhang    U(i,:) contains diag[i] as its last entry, i.e.,
32846bce7ff8SHong Zhang     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
32856bce7ff8SHong Zhang */
32866bce7ff8SHong Zhang #undef __FUNCT__
32876bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
32886bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
32896bce7ff8SHong Zhang {
32906bce7ff8SHong Zhang 
32916bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
32926bce7ff8SHong Zhang   PetscErrorCode     ierr;
3293914a18a2SHong Zhang   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
32946bce7ff8SHong Zhang   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
32956bce7ff8SHong Zhang 
32966bce7ff8SHong Zhang   PetscFunctionBegin;
32976bce7ff8SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
32986bce7ff8SHong Zhang   b     = (Mat_SeqBAIJ*)(fact)->data;
3299914a18a2SHong Zhang   bdiag = b->diag;
33006bce7ff8SHong Zhang 
33016bce7ff8SHong Zhang   /* replace matrix arrays with single allocations, then reset values */
33026bce7ff8SHong Zhang   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
33036bce7ff8SHong Zhang 
33046bce7ff8SHong Zhang   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
33056bce7ff8SHong Zhang   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
33066bce7ff8SHong Zhang   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
33076bce7ff8SHong Zhang   b->singlemalloc = PETSC_FALSE;
33086bce7ff8SHong Zhang   if (mbs > 0) {
33096bce7ff8SHong Zhang     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
33106bce7ff8SHong Zhang   }
33116bce7ff8SHong Zhang 
33126bce7ff8SHong Zhang   /* set bi and bj with new data structure */
33136bce7ff8SHong Zhang   bi = b->i;
33146bce7ff8SHong Zhang   bj = b->j;
33156bce7ff8SHong Zhang 
33166bce7ff8SHong Zhang   /* L part */
33176bce7ff8SHong Zhang   bi[0] = 0;
33186bce7ff8SHong Zhang   for (i=0; i<mbs; i++){
33196bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
3320914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
33216bce7ff8SHong Zhang     aj = a->j + ai[i];
33226bce7ff8SHong Zhang     for (j=0; j<nz; j++){
33236bce7ff8SHong Zhang       *bj = aj[j]; bj++;
33246bce7ff8SHong Zhang     }
33256bce7ff8SHong Zhang   }
33266bce7ff8SHong Zhang 
33276bce7ff8SHong Zhang   /* U part */
33286bce7ff8SHong Zhang   bi[mbs+1] = bi[mbs];
33296bce7ff8SHong Zhang   for (i=mbs-1; i>=0; i--){
33306bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
33316bce7ff8SHong Zhang     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3332914a18a2SHong Zhang     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
33336bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
33346bce7ff8SHong Zhang     for (j=0; j<nz; j++){
33356bce7ff8SHong Zhang       *bj = aj[j]; bj++;
33366bce7ff8SHong Zhang     }
33376bce7ff8SHong Zhang     /* diag[i] */
33386bce7ff8SHong Zhang     *bj = i; bj++;
33396bce7ff8SHong Zhang     bdiag[i] = bi[2*mbs-i+1]-1;
33406bce7ff8SHong Zhang   }
33416bce7ff8SHong Zhang   PetscFunctionReturn(0);
33426bce7ff8SHong Zhang }
33436bce7ff8SHong Zhang 
33444e2b4712SSatish Balay /*
33454e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
33464e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
33474e2b4712SSatish Balay    Not a good example of code reuse.
33484e2b4712SSatish Balay */
3349435faa5fSBarry Smith 
/*
   MatILUFactorSymbolic_SeqBAIJ - symbolic ILU factorization with levels of fill
   for a SeqBAIJ matrix; works on the block nonzero pattern only.

   Input:
     fact  - factor matrix whose index arrays, diagonal pointers, orderings and
             solve work space are set here
     A     - matrix to factor; rejected if MatMissingDiagonal_SeqBAIJ() reports a
             missing diagonal entry
     isrow - row ordering
     iscol - column ordering; its inverse (isicol) is computed and stored in b->icol
     info  - supplies fill (initial size estimate for the column array), levels,
             diagonal_fill and pivotinblocks

   Two paths:
     - levels == 0 and both orderings are the identity: the nonzero structure of A
       is duplicated (or, with the -ilu_new option, rebuilt by
       MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct) and the routine returns early.
     - otherwise: each permuted row is assembled in a sorted linked list (fill[])
       with a level per entry (im[]), merged with the upper parts of earlier rows,
       and copied to ajnew/ajfill, which are regrown when they run out of room.

   Returns 0 on success or a PETSc error code.
*/
33504a2ae208SSatish Balay #undef __FUNCT__
33514a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
33520481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
33534e2b4712SSatish Balay {
33544e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
33554e2b4712SSatish Balay   IS             isicol;
33566849ba73SBarry Smith   PetscErrorCode ierr;
33575d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
33585d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3359a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3360d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
336141df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
3362329f5518SBarry Smith   PetscReal      f;
33634e2b4712SSatish Balay 
33644e2b4712SSatish Balay   PetscFunctionBegin;
  /* stop right away if A lacks a diagonal entry; dd receives the offending row */
33656bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
33666bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
33676bce7ff8SHong Zhang 
3368435faa5fSBarry Smith   f             = info->fill;
3369690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3370690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
33714c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3372667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3373667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
33747d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
3375309c388cSBarry Smith 
337641df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
33776bce7ff8SHong Zhang 
    /* -ilu_new (default off) selects the L/U-split layout and its matching numeric routine */
33786bce7ff8SHong Zhang     PetscTruth newdatastruct=PETSC_FALSE;
33796bce7ff8SHong Zhang     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
33806bce7ff8SHong Zhang     if (newdatastruct){
33816bce7ff8SHong Zhang       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
33826bce7ff8SHong Zhang       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
33836bce7ff8SHong Zhang     } else {
3384719d5645SBarry Smith       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
33856bce7ff8SHong Zhang       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
33866bce7ff8SHong Zhang     }
33876bce7ff8SHong Zhang 
    /* record the orderings on the factor: a reference is taken on isrow and iscol,
       while isicol (created above) is stored as is */
3388719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3389719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
3390bb3d539aSBarry Smith     b->row       = isrow;
3391bb3d539aSBarry Smith     b->col       = iscol;
3392bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3393bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3394bb3d539aSBarry Smith     b->icol      = isicol;
3395bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3396719d5645SBarry Smith     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
33976bce7ff8SHong Zhang     PetscFunctionReturn(0);
33986bce7ff8SHong Zhang   }
33996bce7ff8SHong Zhang 
34006bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
34014e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34024e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
34034e2b4712SSatish Balay 
34044e2b4712SSatish Balay     /* get new row pointers */
3405690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
34064e2b4712SSatish Balay     ainew[0] = 0;
34074e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3408690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3409690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
34104e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3411690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
34124e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3413690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
34144e2b4712SSatish Balay     /* im is level for each filled value */
3415690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
34164e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3417690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
34184e2b4712SSatish Balay     dloc[0]  = 0;
34194e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3420435faa5fSBarry Smith 
3421435faa5fSBarry Smith       /* copy prow into linked list */
      /* fill[n] is the list head; the value n also terminates the list, so the
         sorted-insert scans below always stop */
34224e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
34233b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
34244e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
34254e2b4712SSatish Balay       fill[n]    = n;
3426435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
34274e2b4712SSatish Balay       while (nz--) {
34284e2b4712SSatish Balay 	fm  = n;
34294e2b4712SSatish Balay 	idx = ic[*xi++];
34304e2b4712SSatish Balay 	do {
34314e2b4712SSatish Balay 	  m  = fm;
34324e2b4712SSatish Balay 	  fm = fill[m];
34334e2b4712SSatish Balay 	} while (fm < idx);
34344e2b4712SSatish Balay 	fill[m]   = idx;
34354e2b4712SSatish Balay 	fill[idx] = fm;
34364e2b4712SSatish Balay 	im[idx]   = 0;
34374e2b4712SSatish Balay       }
3438435faa5fSBarry Smith 
3439435faa5fSBarry Smith       /* make sure diagonal entry is included */
3440435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3441435faa5fSBarry Smith 	fm = n;
3442435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3443435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3444435faa5fSBarry Smith 	fill[fm]   = prow;
3445435faa5fSBarry Smith 	im[prow]   = 0;
3446435faa5fSBarry Smith 	nzf++;
3447335d9088SBarry Smith 	dcount++;
3448435faa5fSBarry Smith       }
3449435faa5fSBarry Smith 
      /* for every list entry row < prow, merge in the part of the already-factored
         row that lies after its diagonal; an entry is admitted only while its level
         plus incrlev stays within levels. nzi counts the entries left of the diagonal */
34504e2b4712SSatish Balay       nzi = 0;
34514e2b4712SSatish Balay       row = fill[n];
34524e2b4712SSatish Balay       while (row < prow) {
34534e2b4712SSatish Balay 	incrlev = im[row] + 1;
34544e2b4712SSatish Balay 	nz      = dloc[row];
3455435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
34564e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
34574e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
34584e2b4712SSatish Balay 	fm      = row;
34594e2b4712SSatish Balay 	while (nnz-- > 0) {
34604e2b4712SSatish Balay 	  idx = *xi++;
34614e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
34624e2b4712SSatish Balay 	    flev++;
34634e2b4712SSatish Balay 	    continue;
34644e2b4712SSatish Balay 	  }
34654e2b4712SSatish Balay 	  do {
34664e2b4712SSatish Balay 	    m  = fm;
34674e2b4712SSatish Balay 	    fm = fill[m];
34684e2b4712SSatish Balay 	  } while (fm < idx);
34694e2b4712SSatish Balay 	  if (fm != idx) {
34704e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
34714e2b4712SSatish Balay 	    fill[m]   = idx;
34724e2b4712SSatish Balay 	    fill[idx] = fm;
34734e2b4712SSatish Balay 	    fm        = idx;
34744e2b4712SSatish Balay 	    nzf++;
3475ecf371e4SBarry Smith 	  } else {
34764e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
34774e2b4712SSatish Balay 	  }
34784e2b4712SSatish Balay 	  flev++;
34794e2b4712SSatish Balay 	}
34804e2b4712SSatish Balay 	row = fill[row];
34814e2b4712SSatish Balay 	nzi++;
34824e2b4712SSatish Balay       }
34834e2b4712SSatish Balay       /* copy new filled row into permanent storage */
34844e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
34854e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3486ecf371e4SBarry Smith 
3487ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3488ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3489ecf371e4SBarry Smith 	/* just double the memory each time */
3490690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3491ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
34924e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
34934e2b4712SSatish Balay 	jmax += maxadd;
3494ecf371e4SBarry Smith 
3495ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
	/* only the ainew[prow] entries of the rows finished so far are carried over */
34965d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
34975d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3498606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
34995d0c19d7SBarry Smith 	ajnew = xitmp;
35005d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
35015d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3502606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
35035d0c19d7SBarry Smith 	ajfill = xitmp;
3504eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
35054e2b4712SSatish Balay       }
      /* walk the list once more, writing column indices to ajnew and levels to ajfill;
         dloc[prow] is, for now, the diagonal's offset within the row */
35065d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
35074e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
35084e2b4712SSatish Balay       dloc[prow]  = nzi;
35094e2b4712SSatish Balay       fm          = fill[n];
35104e2b4712SSatish Balay       while (nzf--) {
35115d0c19d7SBarry Smith 	*xitmp++ = fm;
35124e2b4712SSatish Balay 	*flev++ = im[fm];
35134e2b4712SSatish Balay 	fm      = fill[fm];
35144e2b4712SSatish Balay       }
3515435faa5fSBarry Smith       /* make sure row has diagonal entry */
3516435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
351777431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
35182401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3519435faa5fSBarry Smith       }
35204e2b4712SSatish Balay     }
    /* the level and work arrays are no longer needed; ajnew, ainew and dloc are
       kept and become b->j, b->i and b->diag below */
3521606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
35224e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
35234e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3524606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3525606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
35264e2b4712SSatish Balay 
35276cf91177SBarry Smith #if defined(PETSC_USE_INFO)
35284e2b4712SSatish Balay     {
3529329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3530ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3531ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3532ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3533ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3534335d9088SBarry Smith       if (diagonal_fill) {
3535ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3536335d9088SBarry Smith       }
35374e2b4712SSatish Balay     }
353863ba0a88SBarry Smith #endif
35394e2b4712SSatish Balay 
35404e2b4712SSatish Balay     /* put together the new matrix */
3541719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3542719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3543719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3544e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3545e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
35467c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3547a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
35484e2b4712SSatish Balay     b->j          = ajnew;
35494e2b4712SSatish Balay     b->i          = ainew;
    /* turn the per-row diagonal offsets into absolute positions in b->j */
35504e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
35514e2b4712SSatish Balay     b->diag       = dloc;
35524e2b4712SSatish Balay     b->ilen       = 0;
35534e2b4712SSatish Balay     b->imax       = 0;
35544e2b4712SSatish Balay     b->row        = isrow;
35554e2b4712SSatish Balay     b->col        = iscol;
3556bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3557c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3558c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3559e51c0b9cSSatish Balay     b->icol       = isicol;
356087828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
35614e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
35624e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3563719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
35644e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
35654e2b4712SSatish Balay 
    /* prow == n here because the row loop above ran to completion, so ai[prow] is
       the same ai[n] used in the fill-ratio info message */
3566719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3567719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3568719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
35696bce7ff8SHong Zhang 
357041df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
35718661488fSKris Buschelman   PetscFunctionReturn(0);
35728661488fSKris Buschelman }
35738661488fSKris Buschelman 
3574732ee342SKris Buschelman #undef __FUNCT__
35757e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3576dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
35777e7071cdSKris Buschelman {
357812272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
357912272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
35805a9542e3SKris Buschelman   PetscFunctionBegin;
35817cf1b8d3SKris Buschelman   /* Undo Column scaling */
35827cf1b8d3SKris Buschelman /*    while (nz--) { */
35837cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
35847cf1b8d3SKris Buschelman /*    } */
3585c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3586c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
35877cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
35887cf1b8d3SKris Buschelman }
35897cf1b8d3SKris Buschelman 
35907cf1b8d3SKris Buschelman #undef __FUNCT__
35917cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3592dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
35937cf1b8d3SKris Buschelman {
35947cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3595b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
35962aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
35975a9542e3SKris Buschelman   PetscFunctionBegin;
35980b9da03eSKris Buschelman   /* Is this really necessary? */
359920235379SKris Buschelman   while (nz--) {
36000b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
36017e7071cdSKris Buschelman   }
3602c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
36037e7071cdSKris Buschelman   PetscFunctionReturn(0);
36047e7071cdSKris Buschelman }
36057e7071cdSKris Buschelman 
3606732ee342SKris Buschelman 
3607