xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision c60f02097ec1bad1d486ea4e6635560ad6843df9)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
34e2b4712SSatish Balay /*
44e2b4712SSatish Balay     Factorization code for BAIJ format.
54e2b4712SSatish Balay */
64e2b4712SSatish Balay 
77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
8*c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
94e2b4712SSatish Balay 
104a2ae208SSatish Balay #undef __FUNCT__
114a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
12dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
13f1af5d2fSBarry Smith {
14f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
15dfbe8321SBarry Smith   PetscErrorCode ierr;
16690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
17690b6cddSBarry Smith   PetscInt       *diag = a->diag;
18f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
1987828ca2SBarry Smith   PetscScalar    s1,*x,*b;
20f1af5d2fSBarry Smith 
21f1af5d2fSBarry Smith   PetscFunctionBegin;
22ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
231ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25f1af5d2fSBarry Smith 
26f1af5d2fSBarry Smith   /* forward solve the U^T */
27f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith     v     = aa + diag[i];
30f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
31ef66eb69SBarry Smith     s1    = (*v++)*x[i];
32f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
33f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
34f1af5d2fSBarry Smith     while (nz--) {
35f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
36f1af5d2fSBarry Smith     }
37f1af5d2fSBarry Smith     x[i]   = s1;
38f1af5d2fSBarry Smith   }
39f1af5d2fSBarry Smith   /* backward solve the L^T */
40f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
41f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
42f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
43f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
44f1af5d2fSBarry Smith     s1   = x[i];
45f1af5d2fSBarry Smith     while (nz--) {
46f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
47f1af5d2fSBarry Smith     }
48f1af5d2fSBarry Smith   }
491ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
52f1af5d2fSBarry Smith   PetscFunctionReturn(0);
53f1af5d2fSBarry Smith }
54f1af5d2fSBarry Smith 
554a2ae208SSatish Balay #undef __FUNCT__
564a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
57dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
58f1af5d2fSBarry Smith {
59f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
60dfbe8321SBarry Smith   PetscErrorCode ierr;
61690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
62690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
63f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6487828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6587828ca2SBarry Smith   PetscScalar    *x,*b;
66f1af5d2fSBarry Smith 
67f1af5d2fSBarry Smith   PetscFunctionBegin;
68ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
691ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
71f1af5d2fSBarry Smith 
72f1af5d2fSBarry Smith   /* forward solve the U^T */
73f1af5d2fSBarry Smith   idx = 0;
74f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
75f1af5d2fSBarry Smith 
76f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
77f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
78ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
79f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
80f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
81f1af5d2fSBarry Smith     v += 4;
82f1af5d2fSBarry Smith 
83f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
84f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
85f1af5d2fSBarry Smith     while (nz--) {
86f1af5d2fSBarry Smith       oidx = 2*(*vi++);
87f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
88f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
89f1af5d2fSBarry Smith       v  += 4;
90f1af5d2fSBarry Smith     }
91f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
92f1af5d2fSBarry Smith     idx += 2;
93f1af5d2fSBarry Smith   }
94f1af5d2fSBarry Smith   /* backward solve the L^T */
95f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
96f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
97f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
98f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
99f1af5d2fSBarry Smith     idt  = 2*i;
100f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
101f1af5d2fSBarry Smith     while (nz--) {
102f1af5d2fSBarry Smith       idx   = 2*(*vi--);
103f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
104f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
105f1af5d2fSBarry Smith       v -= 4;
106f1af5d2fSBarry Smith     }
107f1af5d2fSBarry Smith   }
1081ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1091ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
110dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
111f1af5d2fSBarry Smith   PetscFunctionReturn(0);
112f1af5d2fSBarry Smith }
113f1af5d2fSBarry Smith 
1144a2ae208SSatish Balay #undef __FUNCT__
1154a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
116dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
117f1af5d2fSBarry Smith {
118f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119dfbe8321SBarry Smith   PetscErrorCode ierr;
120690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
121690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
122f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12387828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
12487828ca2SBarry Smith   PetscScalar    *x,*b;
125f1af5d2fSBarry Smith 
126f1af5d2fSBarry Smith   PetscFunctionBegin;
127ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1281ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
130f1af5d2fSBarry Smith 
131f1af5d2fSBarry Smith   /* forward solve the U^T */
132f1af5d2fSBarry Smith   idx = 0;
133f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
134f1af5d2fSBarry Smith 
135f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
136f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
137ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
138f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
139f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
140f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
141f1af5d2fSBarry Smith     v += 9;
142f1af5d2fSBarry Smith 
143f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
144f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
145f1af5d2fSBarry Smith     while (nz--) {
146f1af5d2fSBarry Smith       oidx = 3*(*vi++);
147f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
148f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
149f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
150f1af5d2fSBarry Smith       v  += 9;
151f1af5d2fSBarry Smith     }
152f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
153f1af5d2fSBarry Smith     idx += 3;
154f1af5d2fSBarry Smith   }
155f1af5d2fSBarry Smith   /* backward solve the L^T */
156f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
157f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
158f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
159f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
160f1af5d2fSBarry Smith     idt  = 3*i;
161f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
162f1af5d2fSBarry Smith     while (nz--) {
163f1af5d2fSBarry Smith       idx   = 3*(*vi--);
164f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
165f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
166f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
167f1af5d2fSBarry Smith       v -= 9;
168f1af5d2fSBarry Smith     }
169f1af5d2fSBarry Smith   }
1701ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1711ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
172dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
173f1af5d2fSBarry Smith   PetscFunctionReturn(0);
174f1af5d2fSBarry Smith }
175f1af5d2fSBarry Smith 
1764a2ae208SSatish Balay #undef __FUNCT__
1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
179f1af5d2fSBarry Smith {
180f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181dfbe8321SBarry Smith   PetscErrorCode ierr;
182690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
184f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
18687828ca2SBarry Smith   PetscScalar    *x,*b;
187f1af5d2fSBarry Smith 
188f1af5d2fSBarry Smith   PetscFunctionBegin;
189ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith   /* forward solve the U^T */
194f1af5d2fSBarry Smith   idx = 0;
195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
196f1af5d2fSBarry Smith 
197f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
199ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
201f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
202f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
203f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
204f1af5d2fSBarry Smith     v += 16;
205f1af5d2fSBarry Smith 
206f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
207f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
208f1af5d2fSBarry Smith     while (nz--) {
209f1af5d2fSBarry Smith       oidx = 4*(*vi++);
210f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
211f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
212f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
213f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
214f1af5d2fSBarry Smith       v  += 16;
215f1af5d2fSBarry Smith     }
216f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
217f1af5d2fSBarry Smith     idx += 4;
218f1af5d2fSBarry Smith   }
219f1af5d2fSBarry Smith   /* backward solve the L^T */
220f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
221f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
222f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
223f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
224f1af5d2fSBarry Smith     idt  = 4*i;
225f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
226f1af5d2fSBarry Smith     while (nz--) {
227f1af5d2fSBarry Smith       idx   = 4*(*vi--);
228f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
229f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
230f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
231f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
232f1af5d2fSBarry Smith       v -= 16;
233f1af5d2fSBarry Smith     }
234f1af5d2fSBarry Smith   }
2351ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
237dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
238f1af5d2fSBarry Smith   PetscFunctionReturn(0);
239f1af5d2fSBarry Smith }
240f1af5d2fSBarry Smith 
2414a2ae208SSatish Balay #undef __FUNCT__
2424a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
243dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
244f1af5d2fSBarry Smith {
245f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
246dfbe8321SBarry Smith   PetscErrorCode ierr;
247690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
248690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
249f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
25087828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
25187828ca2SBarry Smith   PetscScalar    *x,*b;
252f1af5d2fSBarry Smith 
253f1af5d2fSBarry Smith   PetscFunctionBegin;
254ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2551ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
257f1af5d2fSBarry Smith 
258f1af5d2fSBarry Smith   /* forward solve the U^T */
259f1af5d2fSBarry Smith   idx = 0;
260f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
261f1af5d2fSBarry Smith 
262f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
263f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
264ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
265f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
266f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
267f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
268f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
269f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
270f1af5d2fSBarry Smith     v += 25;
271f1af5d2fSBarry Smith 
272f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
273f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
274f1af5d2fSBarry Smith     while (nz--) {
275f1af5d2fSBarry Smith       oidx = 5*(*vi++);
276f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
277f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
278f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
279f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
280f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
281f1af5d2fSBarry Smith       v  += 25;
282f1af5d2fSBarry Smith     }
283f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
284f1af5d2fSBarry Smith     idx += 5;
285f1af5d2fSBarry Smith   }
286f1af5d2fSBarry Smith   /* backward solve the L^T */
287f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
288f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
289f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
290f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
291f1af5d2fSBarry Smith     idt  = 5*i;
292f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
293f1af5d2fSBarry Smith     while (nz--) {
294f1af5d2fSBarry Smith       idx   = 5*(*vi--);
295f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
296f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
297f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
298f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
299f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
300f1af5d2fSBarry Smith       v -= 25;
301f1af5d2fSBarry Smith     }
302f1af5d2fSBarry Smith   }
3031ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
305dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
306f1af5d2fSBarry Smith   PetscFunctionReturn(0);
307f1af5d2fSBarry Smith }
308f1af5d2fSBarry Smith 
3094a2ae208SSatish Balay #undef __FUNCT__
3104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
311dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
312f1af5d2fSBarry Smith {
313f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
314dfbe8321SBarry Smith   PetscErrorCode ierr;
315690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
316690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
317f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
31887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
31987828ca2SBarry Smith   PetscScalar    *x,*b;
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith   PetscFunctionBegin;
322ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3231ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3241ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
325f1af5d2fSBarry Smith 
326f1af5d2fSBarry Smith   /* forward solve the U^T */
327f1af5d2fSBarry Smith   idx = 0;
328f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
329f1af5d2fSBarry Smith 
330f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
331f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
332ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
333ef66eb69SBarry Smith     x6    = x[5+idx];
334f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
335f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
336f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
337f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
338f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
339f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
340f1af5d2fSBarry Smith     v += 36;
341f1af5d2fSBarry Smith 
342f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
343f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
344f1af5d2fSBarry Smith     while (nz--) {
345f1af5d2fSBarry Smith       oidx = 6*(*vi++);
346f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
347f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
348f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
349f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
350f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
351f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
352f1af5d2fSBarry Smith       v  += 36;
353f1af5d2fSBarry Smith     }
354f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
355f1af5d2fSBarry Smith     x[5+idx] = s6;
356f1af5d2fSBarry Smith     idx += 6;
357f1af5d2fSBarry Smith   }
358f1af5d2fSBarry Smith   /* backward solve the L^T */
359f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
360f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
361f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
362f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
363f1af5d2fSBarry Smith     idt  = 6*i;
364f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
365f1af5d2fSBarry Smith     s6 = x[5+idt];
366f1af5d2fSBarry Smith     while (nz--) {
367f1af5d2fSBarry Smith       idx   = 6*(*vi--);
368f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
369f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
370f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
371f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
372f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
373f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
374f1af5d2fSBarry Smith       v -= 36;
375f1af5d2fSBarry Smith     }
376f1af5d2fSBarry Smith   }
3771ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
379dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
380f1af5d2fSBarry Smith   PetscFunctionReturn(0);
381f1af5d2fSBarry Smith }
382f1af5d2fSBarry Smith 
3834a2ae208SSatish Balay #undef __FUNCT__
3844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
385dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
386f1af5d2fSBarry Smith {
387f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
388dfbe8321SBarry Smith   PetscErrorCode ierr;
389690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
390690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
391f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
39287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
39387828ca2SBarry Smith   PetscScalar    *x,*b;
394f1af5d2fSBarry Smith 
395f1af5d2fSBarry Smith   PetscFunctionBegin;
396ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3971ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3981ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
399f1af5d2fSBarry Smith 
400f1af5d2fSBarry Smith   /* forward solve the U^T */
401f1af5d2fSBarry Smith   idx = 0;
402f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
403f1af5d2fSBarry Smith 
404f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
405f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
406ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
407ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
408f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
409f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
410f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
411f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
412f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
413f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
414f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
415f1af5d2fSBarry Smith     v += 49;
416f1af5d2fSBarry Smith 
417f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
418f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
419f1af5d2fSBarry Smith     while (nz--) {
420f1af5d2fSBarry Smith       oidx = 7*(*vi++);
421f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
422f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
423f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
424f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
425f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
426f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
427f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
428f1af5d2fSBarry Smith       v  += 49;
429f1af5d2fSBarry Smith     }
430f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
431f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
432f1af5d2fSBarry Smith     idx += 7;
433f1af5d2fSBarry Smith   }
434f1af5d2fSBarry Smith   /* backward solve the L^T */
435f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
436f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
437f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
438f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
439f1af5d2fSBarry Smith     idt  = 7*i;
440f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
441f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
442f1af5d2fSBarry Smith     while (nz--) {
443f1af5d2fSBarry Smith       idx   = 7*(*vi--);
444f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
445f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
446f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
447f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
448f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
449f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
450f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
451f1af5d2fSBarry Smith       v -= 49;
452f1af5d2fSBarry Smith     }
453f1af5d2fSBarry Smith   }
4541ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4551ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
456dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
457f1af5d2fSBarry Smith   PetscFunctionReturn(0);
458f1af5d2fSBarry Smith }
459f1af5d2fSBarry Smith 
460f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
4614a2ae208SSatish Balay #undef __FUNCT__
4624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
463dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
464f1af5d2fSBarry Smith {
465f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
466f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
4676849ba73SBarry Smith   PetscErrorCode ierr;
4685d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
4695d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
470690b6cddSBarry Smith   PetscInt       *diag = a->diag;
471f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
47287828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
473f1af5d2fSBarry Smith 
474f1af5d2fSBarry Smith   PetscFunctionBegin;
4751ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
477f1af5d2fSBarry Smith   t  = a->solve_work;
478f1af5d2fSBarry Smith 
479f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
480f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
481f1af5d2fSBarry Smith 
482f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
483f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
484f1af5d2fSBarry Smith     t[i] = b[c[i]];
485f1af5d2fSBarry Smith   }
486f1af5d2fSBarry Smith 
487f1af5d2fSBarry Smith   /* forward solve the U^T */
488f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
489f1af5d2fSBarry Smith 
490f1af5d2fSBarry Smith     v     = aa + diag[i];
491f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
492f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
493f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
494f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
495f1af5d2fSBarry Smith     while (nz--) {
496f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
497f1af5d2fSBarry Smith     }
498f1af5d2fSBarry Smith     t[i]   = s1;
499f1af5d2fSBarry Smith   }
500f1af5d2fSBarry Smith   /* backward solve the L^T */
501f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
502f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
503f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
504f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
505f1af5d2fSBarry Smith     s1   = t[i];
506f1af5d2fSBarry Smith     while (nz--) {
507f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
508f1af5d2fSBarry Smith     }
509f1af5d2fSBarry Smith   }
510f1af5d2fSBarry Smith 
511f1af5d2fSBarry Smith   /* copy t into x according to permutation */
512f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
513f1af5d2fSBarry Smith     x[r[i]]   = t[i];
514f1af5d2fSBarry Smith   }
515f1af5d2fSBarry Smith 
516f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
517f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5181ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
520dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
521f1af5d2fSBarry Smith   PetscFunctionReturn(0);
522f1af5d2fSBarry Smith }
523f1af5d2fSBarry Smith 
5244a2ae208SSatish Balay #undef __FUNCT__
5254a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
/*
   MatSolveTranspose_SeqBAIJ_2 - Transposed solve with a factored SeqBAIJ
   matrix using 2x2 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 2*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*4*(a->nz) - 2.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
526dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
527f1af5d2fSBarry Smith {
528f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
529f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5306849ba73SBarry Smith   PetscErrorCode ierr;
5315d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5325d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
533690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
534f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53587828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
53687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
537f1af5d2fSBarry Smith 
538f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
5391ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
541f1af5d2fSBarry Smith   t  = a->solve_work;
542f1af5d2fSBarry Smith 
543f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
544f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
547f1af5d2fSBarry Smith   ii = 0;
548f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
549f1af5d2fSBarry Smith     ic      = 2*c[i];
550f1af5d2fSBarry Smith     t[ii]   = b[ic];
551f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
552f1af5d2fSBarry Smith     ii += 2;
553f1af5d2fSBarry Smith   }
554f1af5d2fSBarry Smith 
555f1af5d2fSBarry Smith   /* forward solve the U^T */
556f1af5d2fSBarry Smith   idx = 0;
557f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
558f1af5d2fSBarry Smith 
559f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
560f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* v[0],v[1] feed s1 and v[2],v[3] feed s2 */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
561f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
562f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
563f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
564f1af5d2fSBarry Smith     v += 4;
565f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[2*(*vi)]
       with the already scaled values s1,s2 */
566f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
567f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
568f1af5d2fSBarry Smith     while (nz--) {
569f1af5d2fSBarry Smith       oidx = 2*(*vi++);
570f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
571f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
572f1af5d2fSBarry Smith       v  += 4;
573f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
574f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
575f1af5d2fSBarry Smith     idx += 2;
576f1af5d2fSBarry Smith   }
577f1af5d2fSBarry Smith   /* backward solve the L^T */
578f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
579f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
580f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
581f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
582f1af5d2fSBarry Smith     idt  = 2*i;
583f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
584f1af5d2fSBarry Smith     while (nz--) {
585f1af5d2fSBarry Smith       idx   = 2*(*vi--);
586f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
587f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
588f1af5d2fSBarry Smith       v -= 4;
589f1af5d2fSBarry Smith     }
590f1af5d2fSBarry Smith   }
591f1af5d2fSBarry Smith 
592f1af5d2fSBarry Smith   /* copy t into x according to permutation */
593f1af5d2fSBarry Smith   ii = 0;
594f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
595f1af5d2fSBarry Smith     ir      = 2*r[i];
596f1af5d2fSBarry Smith     x[ir]   = t[ii];
597f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
598f1af5d2fSBarry Smith     ii += 2;
599f1af5d2fSBarry Smith   }
600f1af5d2fSBarry Smith 
601f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
602f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6031ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
605dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
606f1af5d2fSBarry Smith   PetscFunctionReturn(0);
607f1af5d2fSBarry Smith }
608f1af5d2fSBarry Smith 
6094a2ae208SSatish Balay #undef __FUNCT__
6104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
/*
   MatSolveTranspose_SeqBAIJ_3 - Transposed solve with a factored SeqBAIJ
   matrix using 3x3 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 3*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*9*(a->nz) - 3.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
611dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
612f1af5d2fSBarry Smith {
613f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
614f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6156849ba73SBarry Smith   PetscErrorCode ierr;
6165d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6175d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
618690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
619f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
62087828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
62187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
622f1af5d2fSBarry Smith 
623f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
6241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
626f1af5d2fSBarry Smith   t  = a->solve_work;
627f1af5d2fSBarry Smith 
628f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
629f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
630f1af5d2fSBarry Smith 
631f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
632f1af5d2fSBarry Smith   ii = 0;
633f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
634f1af5d2fSBarry Smith     ic      = 3*c[i];
635f1af5d2fSBarry Smith     t[ii]   = b[ic];
636f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
637f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
638f1af5d2fSBarry Smith     ii += 3;
639f1af5d2fSBarry Smith   }
640f1af5d2fSBarry Smith 
641f1af5d2fSBarry Smith   /* forward solve the U^T */
642f1af5d2fSBarry Smith   idx = 0;
643f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
644f1af5d2fSBarry Smith 
645f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
646f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* each s_j is formed from 3 consecutive entries v[3*(j-1)..3*j-1] */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
647f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
648f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
649f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
650f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
651f1af5d2fSBarry Smith     v += 9;
652f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[3*(*vi)]
       with the already scaled values s1..s3 */
653f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
654f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
655f1af5d2fSBarry Smith     while (nz--) {
656f1af5d2fSBarry Smith       oidx = 3*(*vi++);
657f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
658f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
659f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
660f1af5d2fSBarry Smith       v  += 9;
661f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
662f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
663f1af5d2fSBarry Smith     idx += 3;
664f1af5d2fSBarry Smith   }
665f1af5d2fSBarry Smith   /* backward solve the L^T */
666f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
667f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
668f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
669f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
670f1af5d2fSBarry Smith     idt  = 3*i;
671f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
672f1af5d2fSBarry Smith     while (nz--) {
673f1af5d2fSBarry Smith       idx   = 3*(*vi--);
674f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
675f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
676f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
677f1af5d2fSBarry Smith       v -= 9;
678f1af5d2fSBarry Smith     }
679f1af5d2fSBarry Smith   }
680f1af5d2fSBarry Smith 
681f1af5d2fSBarry Smith   /* copy t into x according to permutation */
682f1af5d2fSBarry Smith   ii = 0;
683f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
684f1af5d2fSBarry Smith     ir      = 3*r[i];
685f1af5d2fSBarry Smith     x[ir]   = t[ii];
686f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
687f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
688f1af5d2fSBarry Smith     ii += 3;
689f1af5d2fSBarry Smith   }
690f1af5d2fSBarry Smith 
691f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
692f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6931ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
695dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
696f1af5d2fSBarry Smith   PetscFunctionReturn(0);
697f1af5d2fSBarry Smith }
698f1af5d2fSBarry Smith 
6994a2ae208SSatish Balay #undef __FUNCT__
7004a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
/*
   MatSolveTranspose_SeqBAIJ_4 - Transposed solve with a factored SeqBAIJ
   matrix using 4x4 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 4*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*16*(a->nz) - 4.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
701dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
702f1af5d2fSBarry Smith {
703f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
704f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7056849ba73SBarry Smith   PetscErrorCode ierr;
7065d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7075d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
708690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
709f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
71087828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
71187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
712f1af5d2fSBarry Smith 
713f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
7141ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
716f1af5d2fSBarry Smith   t  = a->solve_work;
717f1af5d2fSBarry Smith 
718f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
719f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
720f1af5d2fSBarry Smith 
721f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
722f1af5d2fSBarry Smith   ii = 0;
723f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
724f1af5d2fSBarry Smith     ic      = 4*c[i];
725f1af5d2fSBarry Smith     t[ii]   = b[ic];
726f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
727f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
728f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
729f1af5d2fSBarry Smith     ii += 4;
730f1af5d2fSBarry Smith   }
731f1af5d2fSBarry Smith 
732f1af5d2fSBarry Smith   /* forward solve the U^T */
733f1af5d2fSBarry Smith   idx = 0;
734f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
735f1af5d2fSBarry Smith 
736f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
737f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* each s_j is formed from 4 consecutive entries v[4*(j-1)..4*j-1] */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
738f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
739f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
740f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
741f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
742f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
743f1af5d2fSBarry Smith     v += 16;
744f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[4*(*vi)]
       with the already scaled values s1..s4 */
745f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
746f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
747f1af5d2fSBarry Smith     while (nz--) {
748f1af5d2fSBarry Smith       oidx = 4*(*vi++);
749f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
750f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
751f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
752f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
753f1af5d2fSBarry Smith       v  += 16;
754f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
755f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
756f1af5d2fSBarry Smith     idx += 4;
757f1af5d2fSBarry Smith   }
758f1af5d2fSBarry Smith   /* backward solve the L^T */
759f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
760f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
761f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
762f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
763f1af5d2fSBarry Smith     idt  = 4*i;
764f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
765f1af5d2fSBarry Smith     while (nz--) {
766f1af5d2fSBarry Smith       idx   = 4*(*vi--);
767f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
768f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
769f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
770f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
771f1af5d2fSBarry Smith       v -= 16;
772f1af5d2fSBarry Smith     }
773f1af5d2fSBarry Smith   }
774f1af5d2fSBarry Smith 
775f1af5d2fSBarry Smith   /* copy t into x according to permutation */
776f1af5d2fSBarry Smith   ii = 0;
777f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
778f1af5d2fSBarry Smith     ir      = 4*r[i];
779f1af5d2fSBarry Smith     x[ir]   = t[ii];
780f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
781f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
782f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
783f1af5d2fSBarry Smith     ii += 4;
784f1af5d2fSBarry Smith   }
785f1af5d2fSBarry Smith 
786f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
787f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7881ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
790dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
791f1af5d2fSBarry Smith   PetscFunctionReturn(0);
792f1af5d2fSBarry Smith }
793f1af5d2fSBarry Smith 
7944a2ae208SSatish Balay #undef __FUNCT__
7954a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
/*
   MatSolveTranspose_SeqBAIJ_5 - Transposed solve with a factored SeqBAIJ
   matrix using 5x5 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 5*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*25*(a->nz) - 5.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
796dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
797f1af5d2fSBarry Smith {
798f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
799f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8006849ba73SBarry Smith   PetscErrorCode ierr;
8015d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8025d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
803690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
804f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
80587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
80687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
807f1af5d2fSBarry Smith 
808f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
8091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
811f1af5d2fSBarry Smith   t  = a->solve_work;
812f1af5d2fSBarry Smith 
813f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
814f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
815f1af5d2fSBarry Smith 
816f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
817f1af5d2fSBarry Smith   ii = 0;
818f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
819f1af5d2fSBarry Smith     ic      = 5*c[i];
820f1af5d2fSBarry Smith     t[ii]   = b[ic];
821f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
822f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
823f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
824f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
825f1af5d2fSBarry Smith     ii += 5;
826f1af5d2fSBarry Smith   }
827f1af5d2fSBarry Smith 
828f1af5d2fSBarry Smith   /* forward solve the U^T */
829f1af5d2fSBarry Smith   idx = 0;
830f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
831f1af5d2fSBarry Smith 
832f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
833f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* each s_j is formed from 5 consecutive entries v[5*(j-1)..5*j-1] */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
834f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
835f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
836f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
837f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
838f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
839f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
840f1af5d2fSBarry Smith     v += 25;
841f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[5*(*vi)]
       with the already scaled values s1..s5 */
842f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
843f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
844f1af5d2fSBarry Smith     while (nz--) {
845f1af5d2fSBarry Smith       oidx = 5*(*vi++);
846f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
847f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
848f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
849f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
850f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
851f1af5d2fSBarry Smith       v  += 25;
852f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
853f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
854f1af5d2fSBarry Smith     idx += 5;
855f1af5d2fSBarry Smith   }
856f1af5d2fSBarry Smith   /* backward solve the L^T */
857f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
858f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
859f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
860f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
861f1af5d2fSBarry Smith     idt  = 5*i;
862f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
863f1af5d2fSBarry Smith     while (nz--) {
864f1af5d2fSBarry Smith       idx   = 5*(*vi--);
865f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
866f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
867f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
868f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
869f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
870f1af5d2fSBarry Smith       v -= 25;
871f1af5d2fSBarry Smith     }
872f1af5d2fSBarry Smith   }
873f1af5d2fSBarry Smith 
874f1af5d2fSBarry Smith   /* copy t into x according to permutation */
875f1af5d2fSBarry Smith   ii = 0;
876f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
877f1af5d2fSBarry Smith     ir      = 5*r[i];
878f1af5d2fSBarry Smith     x[ir]   = t[ii];
879f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
880f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
881f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
882f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
883f1af5d2fSBarry Smith     ii += 5;
884f1af5d2fSBarry Smith   }
885f1af5d2fSBarry Smith 
886f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
887f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8881ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
890dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
891f1af5d2fSBarry Smith   PetscFunctionReturn(0);
892f1af5d2fSBarry Smith }
893f1af5d2fSBarry Smith 
8944a2ae208SSatish Balay #undef __FUNCT__
8954a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
/*
   MatSolveTranspose_SeqBAIJ_6 - Transposed solve with a factored SeqBAIJ
   matrix using 6x6 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 6*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*36*(a->nz) - 6.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
896dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
897f1af5d2fSBarry Smith {
898f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
899f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9006849ba73SBarry Smith   PetscErrorCode ierr;
9015d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9025d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
903690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
904f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
90587828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
90687828ca2SBarry Smith   PetscScalar    *x,*b,*t;
907f1af5d2fSBarry Smith 
908f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
9091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
911f1af5d2fSBarry Smith   t  = a->solve_work;
912f1af5d2fSBarry Smith 
913f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
914f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
915f1af5d2fSBarry Smith 
916f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
917f1af5d2fSBarry Smith   ii = 0;
918f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
919f1af5d2fSBarry Smith     ic      = 6*c[i];
920f1af5d2fSBarry Smith     t[ii]   = b[ic];
921f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
922f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
923f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
924f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
925f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
926f1af5d2fSBarry Smith     ii += 6;
927f1af5d2fSBarry Smith   }
928f1af5d2fSBarry Smith 
929f1af5d2fSBarry Smith   /* forward solve the U^T */
930f1af5d2fSBarry Smith   idx = 0;
931f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
932f1af5d2fSBarry Smith 
933f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
934f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* each s_j is formed from 6 consecutive entries v[6*(j-1)..6*j-1] */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
935f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
936f1af5d2fSBarry Smith     x6    = t[5+idx];
937f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
938f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
939f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
940f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
941f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
942f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
943f1af5d2fSBarry Smith     v += 36;
944f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[6*(*vi)]
       with the already scaled values s1..s6 */
945f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
946f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
947f1af5d2fSBarry Smith     while (nz--) {
948f1af5d2fSBarry Smith       oidx = 6*(*vi++);
949f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
950f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
951f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
952f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
953f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
954f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
955f1af5d2fSBarry Smith       v  += 36;
956f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
957f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
958f1af5d2fSBarry Smith     t[5+idx] = s6;
959f1af5d2fSBarry Smith     idx += 6;
960f1af5d2fSBarry Smith   }
961f1af5d2fSBarry Smith   /* backward solve the L^T */
962f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
963f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
964f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
965f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
966f1af5d2fSBarry Smith     idt  = 6*i;
967f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
968f1af5d2fSBarry Smith     s6 = t[5+idt];
969f1af5d2fSBarry Smith     while (nz--) {
970f1af5d2fSBarry Smith       idx   = 6*(*vi--);
971f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
972f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
973f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
974f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
975f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
976f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
977f1af5d2fSBarry Smith       v -= 36;
978f1af5d2fSBarry Smith     }
979f1af5d2fSBarry Smith   }
980f1af5d2fSBarry Smith 
981f1af5d2fSBarry Smith   /* copy t into x according to permutation */
982f1af5d2fSBarry Smith   ii = 0;
983f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
984f1af5d2fSBarry Smith     ir      = 6*r[i];
985f1af5d2fSBarry Smith     x[ir]   = t[ii];
986f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
987f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
988f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
989f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
990f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
991f1af5d2fSBarry Smith     ii += 6;
992f1af5d2fSBarry Smith   }
993f1af5d2fSBarry Smith 
994f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
995f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
998dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
999f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1000f1af5d2fSBarry Smith }
1001f1af5d2fSBarry Smith 
10024a2ae208SSatish Balay #undef __FUNCT__
10034a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
/*
   MatSolveTranspose_SeqBAIJ_7 - Transposed solve with a factored SeqBAIJ
   matrix using 7x7 blocks and the row/column index sets stored in the matrix.

   Input Parameters:
+  A  - matrix; a->i, a->j, a->a, a->diag, a->row, a->col and a->solve_work
        are read from its Mat_SeqBAIJ data
-  bb - right-hand side; only read in this routine

   Output Parameter:
.  xx - solution; written block by block at block positions r[i]

   Notes:
   The right-hand side is gathered into the work array t = a->solve_work
   through the indices of a->col, both sweeps update t in place, and the
   result is scattered into xx through the indices of a->row.  t is indexed
   up to 7*n-1, where n = a->mbs.

   diag[i] is used as the position of the diagonal block of block row i in
   aj/aa.  That block is only multiplied here, never inverted, so it is
   presumably stored already inverted by the factorization - TODO confirm
   against the factorization routine.

   Logs 2.0*49*(a->nz) - 7.0*A->cmap->n flops.  Returns 0 on success; every
   PETSc call is checked with CHKERRQ.
*/
1004dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1005f1af5d2fSBarry Smith {
1006f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1007f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10086849ba73SBarry Smith   PetscErrorCode ierr;
10095d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10105d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1011690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1012f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
101387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
101487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1015f1af5d2fSBarry Smith 
1016f1af5d2fSBarry Smith   PetscFunctionBegin;
  /* NOTE(review): b is only read below, yet bb is accessed with VecGetArray */
10171ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1019f1af5d2fSBarry Smith   t  = a->solve_work;
1020f1af5d2fSBarry Smith 
1021f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1022f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1023f1af5d2fSBarry Smith 
1024f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1025f1af5d2fSBarry Smith   ii = 0;
1026f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1027f1af5d2fSBarry Smith     ic      = 7*c[i];
1028f1af5d2fSBarry Smith     t[ii]   = b[ic];
1029f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1030f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1031f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1032f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1033f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1034f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1035f1af5d2fSBarry Smith     ii += 7;
1036f1af5d2fSBarry Smith   }
1037f1af5d2fSBarry Smith 
1038f1af5d2fSBarry Smith   /* forward solve the U^T */
1039f1af5d2fSBarry Smith   idx = 0;
1040f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1041f1af5d2fSBarry Smith 
1042f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1043f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
    /* each s_j is formed from 7 consecutive entries v[7*(j-1)..7*j-1] */
    /* NOTE(review): presumably blocks are stored column-major, which makes this
       a product with the transposed block - confirm against the BAIJ layout */
1044f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1045f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1046f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1047f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1048f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1049f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1050f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1051f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1052f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1053f1af5d2fSBarry Smith     v += 49;
1054f1af5d2fSBarry Smith 
    /* entries after the diagonal of block row i: update block t[7*(*vi)]
       with the already scaled values s1..s7 */
1055f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1056f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1057f1af5d2fSBarry Smith     while (nz--) {
1058f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1059f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1060f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1061f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1062f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1063f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1064f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1065f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1066f1af5d2fSBarry Smith       v  += 49;
1067f1af5d2fSBarry Smith     }
    /* only now is block i of t overwritten with the scaled values */
1068f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1069f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1070f1af5d2fSBarry Smith     idx += 7;
1071f1af5d2fSBarry Smith   }
1072f1af5d2fSBarry Smith   /* backward solve the L^T */
1073f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
    /* entries before the diagonal of block row i, walked backwards from
       position diag[i]-1; nz = diag[i]-ai[i] of them are visited */
1074f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1075f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1076f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1077f1af5d2fSBarry Smith     idt  = 7*i;
1078f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1079f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1080f1af5d2fSBarry Smith     while (nz--) {
1081f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1082f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1083f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1084f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1085f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1086f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1087f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1088f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1089f1af5d2fSBarry Smith       v -= 49;
1090f1af5d2fSBarry Smith     }
1091f1af5d2fSBarry Smith   }
1092f1af5d2fSBarry Smith 
1093f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1094f1af5d2fSBarry Smith   ii = 0;
1095f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1096f1af5d2fSBarry Smith     ir      = 7*r[i];
1097f1af5d2fSBarry Smith     x[ir]   = t[ii];
1098f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1099f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1100f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1101f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1102f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1103f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1104f1af5d2fSBarry Smith     ii += 7;
1105f1af5d2fSBarry Smith   }
1106f1af5d2fSBarry Smith 
1107f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1108f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11091ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11101ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1111dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1112f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1113f1af5d2fSBarry Smith }
1114f1af5d2fSBarry Smith 
11154e2b4712SSatish Balay /* ----------------------------------------------------------- */
11164a2ae208SSatish Balay #undef __FUNCT__
11174a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1118dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11194e2b4712SSatish Balay {
11204e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11214e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11226849ba73SBarry Smith   PetscErrorCode ierr;
11235d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11245d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11255d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11263f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
112787828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11284e2b4712SSatish Balay 
11294e2b4712SSatish Balay   PetscFunctionBegin;
11301ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1132f1af5d2fSBarry Smith   t  = a->solve_work;
11334e2b4712SSatish Balay 
11344e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11354e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11364e2b4712SSatish Balay 
11374e2b4712SSatish Balay   /* forward solve the lower triangular */
113887828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11394e2b4712SSatish Balay   for (i=1; i<n; i++) {
11404e2b4712SSatish Balay     v   = aa + bs2*ai[i];
11414e2b4712SSatish Balay     vi  = aj + ai[i];
11424e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1143f1af5d2fSBarry Smith     s = t + bs*i;
114487828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11454e2b4712SSatish Balay     while (nz--) {
1146f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
11474e2b4712SSatish Balay       v += bs2;
11484e2b4712SSatish Balay     }
11494e2b4712SSatish Balay   }
11504e2b4712SSatish Balay   /* backward solve the upper triangular */
1151d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
11524e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
11534e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
11544e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
11554e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
115687828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11574e2b4712SSatish Balay     while (nz--) {
1158f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
11594e2b4712SSatish Balay       v += bs2;
11604e2b4712SSatish Balay     }
1161f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
116287828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
11634e2b4712SSatish Balay   }
11644e2b4712SSatish Balay 
11654e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
11664e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11671ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1169dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
11704e2b4712SSatish Balay   PetscFunctionReturn(0);
11714e2b4712SSatish Balay }
11724e2b4712SSatish Balay 
11734a2ae208SSatish Balay #undef __FUNCT__
11744a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1175dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
11764e2b4712SSatish Balay {
11774e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11784e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11796849ba73SBarry Smith   PetscErrorCode ierr;
11805d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
11815d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
11823f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118387828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
118487828ca2SBarry Smith   PetscScalar    *x,*b,*t;
11854e2b4712SSatish Balay 
11864e2b4712SSatish Balay   PetscFunctionBegin;
11871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1189f1af5d2fSBarry Smith   t  = a->solve_work;
11904e2b4712SSatish Balay 
11914e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11924e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11934e2b4712SSatish Balay 
11944e2b4712SSatish Balay   /* forward solve the lower triangular */
11954e2b4712SSatish Balay   idx    = 7*(*r++);
1196f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1197f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1198f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
11994e2b4712SSatish Balay 
12004e2b4712SSatish Balay   for (i=1; i<n; i++) {
12014e2b4712SSatish Balay     v     = aa + 49*ai[i];
12024e2b4712SSatish Balay     vi    = aj + ai[i];
12034e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12044e2b4712SSatish Balay     idx   = 7*(*r++);
1205f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1206f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12074e2b4712SSatish Balay     while (nz--) {
12084e2b4712SSatish Balay       idx   = 7*(*vi++);
1209f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1210f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1211f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1212f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1213f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1214f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1215f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1216f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1217f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1218f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12194e2b4712SSatish Balay       v += 49;
12204e2b4712SSatish Balay     }
12214e2b4712SSatish Balay     idx = 7*i;
1222f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1223f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1224f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12254e2b4712SSatish Balay   }
12264e2b4712SSatish Balay   /* backward solve the upper triangular */
12274e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12284e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12294e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12304e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12314e2b4712SSatish Balay     idt  = 7*i;
1232f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1233f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1234f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12354e2b4712SSatish Balay     while (nz--) {
12364e2b4712SSatish Balay       idx   = 7*(*vi++);
1237f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1238f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1239f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1240f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1241f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1242f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1243f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1244f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1245f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1246f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12474e2b4712SSatish Balay       v += 49;
12484e2b4712SSatish Balay     }
12494e2b4712SSatish Balay     idc = 7*(*c--);
12504e2b4712SSatish Balay     v   = aa + 49*diag[i];
1251f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1252f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1253f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1254f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1255f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1256f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1257f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1258f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1259f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1260f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1261f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1262f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1263f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1264f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
12654e2b4712SSatish Balay   }
12664e2b4712SSatish Balay 
12674e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12684e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12691ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1271dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
12724e2b4712SSatish Balay   PetscFunctionReturn(0);
12734e2b4712SSatish Balay }
12744e2b4712SSatish Balay 
12754a2ae208SSatish Balay #undef __FUNCT__
12764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
127815091d37SBarry Smith {
127915091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1280690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1281dfbe8321SBarry Smith   PetscErrorCode    ierr;
1282690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1283d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1284d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1285d9fead3dSBarry Smith   const PetscScalar *b;
128615091d37SBarry Smith 
128715091d37SBarry Smith   PetscFunctionBegin;
1288d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
129015091d37SBarry Smith   /* forward solve the lower triangular */
129115091d37SBarry Smith   idx    = 0;
129215091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
129315091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
129415091d37SBarry Smith   x[6] = b[6+idx];
129515091d37SBarry Smith   for (i=1; i<n; i++) {
129615091d37SBarry Smith     v     =  aa + 49*ai[i];
129715091d37SBarry Smith     vi    =  aj + ai[i];
129815091d37SBarry Smith     nz    =  diag[i] - ai[i];
129915091d37SBarry Smith     idx   =  7*i;
1300f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1301f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1302f1af5d2fSBarry Smith     s7  =  b[6+idx];
130315091d37SBarry Smith     while (nz--) {
130415091d37SBarry Smith       jdx   = 7*(*vi++);
130515091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
130615091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
130715091d37SBarry Smith       x7    = x[6+jdx];
1308f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1309f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1310f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1311f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1312f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1313f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1314f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
131515091d37SBarry Smith       v += 49;
131615091d37SBarry Smith      }
1317f1af5d2fSBarry Smith     x[idx]   = s1;
1318f1af5d2fSBarry Smith     x[1+idx] = s2;
1319f1af5d2fSBarry Smith     x[2+idx] = s3;
1320f1af5d2fSBarry Smith     x[3+idx] = s4;
1321f1af5d2fSBarry Smith     x[4+idx] = s5;
1322f1af5d2fSBarry Smith     x[5+idx] = s6;
1323f1af5d2fSBarry Smith     x[6+idx] = s7;
132415091d37SBarry Smith   }
132515091d37SBarry Smith   /* backward solve the upper triangular */
132615091d37SBarry Smith   for (i=n-1; i>=0; i--){
132715091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
132815091d37SBarry Smith     vi   = aj + diag[i] + 1;
132915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
133015091d37SBarry Smith     idt  = 7*i;
1331f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1332f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1333f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1334f1af5d2fSBarry Smith     s7 = x[6+idt];
133515091d37SBarry Smith     while (nz--) {
133615091d37SBarry Smith       idx   = 7*(*vi++);
133715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
133815091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
133915091d37SBarry Smith       x7    = x[6+idx];
1340f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1341f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1342f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1343f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1344f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1345f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1346f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
134715091d37SBarry Smith       v += 49;
134815091d37SBarry Smith     }
134915091d37SBarry Smith     v        = aa + 49*diag[i];
1350f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1351f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1352f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1353f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1354f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1355f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1356f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1357f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1358f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1359f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1360f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1361f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1362f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1363f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
136415091d37SBarry Smith   }
136515091d37SBarry Smith 
1366d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1368dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
136915091d37SBarry Smith   PetscFunctionReturn(0);
137015091d37SBarry Smith }
137115091d37SBarry Smith 
13724a2ae208SSatish Balay #undef __FUNCT__
13734a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1374dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
137515091d37SBarry Smith {
137615091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
137715091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
13786849ba73SBarry Smith   PetscErrorCode    ierr;
13795d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
13805d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1381d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1382d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1383d9fead3dSBarry Smith   const PetscScalar *b;
138415091d37SBarry Smith   PetscFunctionBegin;
1385d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1387f1af5d2fSBarry Smith   t  = a->solve_work;
138815091d37SBarry Smith 
138915091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
139015091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
139115091d37SBarry Smith 
139215091d37SBarry Smith   /* forward solve the lower triangular */
139315091d37SBarry Smith   idx    = 6*(*r++);
1394f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1395f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1396f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
139715091d37SBarry Smith   for (i=1; i<n; i++) {
139815091d37SBarry Smith     v     = aa + 36*ai[i];
139915091d37SBarry Smith     vi    = aj + ai[i];
140015091d37SBarry Smith     nz    = diag[i] - ai[i];
140115091d37SBarry Smith     idx   = 6*(*r++);
1402f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1403f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
140415091d37SBarry Smith     while (nz--) {
140515091d37SBarry Smith       idx   = 6*(*vi++);
1406f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1407f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1408f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1409f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1410f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1411f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1412f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1413f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
141415091d37SBarry Smith       v += 36;
141515091d37SBarry Smith     }
141615091d37SBarry Smith     idx = 6*i;
1417f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1418f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1419f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
142015091d37SBarry Smith   }
142115091d37SBarry Smith   /* backward solve the upper triangular */
142215091d37SBarry Smith   for (i=n-1; i>=0; i--){
142315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
142415091d37SBarry Smith     vi   = aj + diag[i] + 1;
142515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
142615091d37SBarry Smith     idt  = 6*i;
1427f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1428f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1429f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
143015091d37SBarry Smith     while (nz--) {
143115091d37SBarry Smith       idx   = 6*(*vi++);
1432f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1433f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1434f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1435f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1436f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1437f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1438f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1439f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1440f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
144115091d37SBarry Smith       v += 36;
144215091d37SBarry Smith     }
144315091d37SBarry Smith     idc = 6*(*c--);
144415091d37SBarry Smith     v   = aa + 36*diag[i];
1445f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1446f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1447f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1448f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1449f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1450f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1451f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1452f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1453f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1454f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1455f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1456f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
145715091d37SBarry Smith   }
145815091d37SBarry Smith 
145915091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
146015091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1461d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14621ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1463dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
146415091d37SBarry Smith   PetscFunctionReturn(0);
146515091d37SBarry Smith }
146615091d37SBarry Smith 
14674a2ae208SSatish Balay #undef __FUNCT__
14684a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1469dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
147015091d37SBarry Smith {
147115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1472690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1473dfbe8321SBarry Smith   PetscErrorCode    ierr;
1474690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1475d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1476d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1477d9fead3dSBarry Smith   const PetscScalar *b;
147815091d37SBarry Smith 
147915091d37SBarry Smith   PetscFunctionBegin;
1480d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14811ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
148215091d37SBarry Smith   /* forward solve the lower triangular */
148315091d37SBarry Smith   idx    = 0;
148415091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
148515091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
148615091d37SBarry Smith   for (i=1; i<n; i++) {
148715091d37SBarry Smith     v     =  aa + 36*ai[i];
148815091d37SBarry Smith     vi    =  aj + ai[i];
148915091d37SBarry Smith     nz    =  diag[i] - ai[i];
149015091d37SBarry Smith     idx   =  6*i;
1491f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1492f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
149315091d37SBarry Smith     while (nz--) {
149415091d37SBarry Smith       jdx   = 6*(*vi++);
149515091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
149615091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1502f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150315091d37SBarry Smith       v += 36;
150415091d37SBarry Smith      }
1505f1af5d2fSBarry Smith     x[idx]   = s1;
1506f1af5d2fSBarry Smith     x[1+idx] = s2;
1507f1af5d2fSBarry Smith     x[2+idx] = s3;
1508f1af5d2fSBarry Smith     x[3+idx] = s4;
1509f1af5d2fSBarry Smith     x[4+idx] = s5;
1510f1af5d2fSBarry Smith     x[5+idx] = s6;
151115091d37SBarry Smith   }
151215091d37SBarry Smith   /* backward solve the upper triangular */
151315091d37SBarry Smith   for (i=n-1; i>=0; i--){
151415091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
151515091d37SBarry Smith     vi   = aj + diag[i] + 1;
151615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
151715091d37SBarry Smith     idt  = 6*i;
1518f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1519f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1520f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
152115091d37SBarry Smith     while (nz--) {
152215091d37SBarry Smith       idx   = 6*(*vi++);
152315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
152415091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1525f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1526f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1527f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1528f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1529f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1530f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
153115091d37SBarry Smith       v += 36;
153215091d37SBarry Smith     }
153315091d37SBarry Smith     v        = aa + 36*diag[i];
1534f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1535f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1536f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1537f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1538f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1539f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
154015091d37SBarry Smith   }
154115091d37SBarry Smith 
1542d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
154515091d37SBarry Smith   PetscFunctionReturn(0);
154615091d37SBarry Smith }
154715091d37SBarry Smith 
15484a2ae208SSatish Balay #undef __FUNCT__
15494a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1550dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
15514e2b4712SSatish Balay {
15524e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
15534e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
15546849ba73SBarry Smith   PetscErrorCode    ierr;
15555d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
15565d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1557d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1558d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1559d9fead3dSBarry Smith   const PetscScalar *b;
15604e2b4712SSatish Balay 
15614e2b4712SSatish Balay   PetscFunctionBegin;
1562d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15631ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1564f1af5d2fSBarry Smith   t  = a->solve_work;
15654e2b4712SSatish Balay 
15664e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15674e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15684e2b4712SSatish Balay 
15694e2b4712SSatish Balay   /* forward solve the lower triangular */
15704e2b4712SSatish Balay   idx    = 5*(*r++);
1571f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1572f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
15734e2b4712SSatish Balay   for (i=1; i<n; i++) {
15744e2b4712SSatish Balay     v     = aa + 25*ai[i];
15754e2b4712SSatish Balay     vi    = aj + ai[i];
15764e2b4712SSatish Balay     nz    = diag[i] - ai[i];
15774e2b4712SSatish Balay     idx   = 5*(*r++);
1578f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1579f1af5d2fSBarry Smith     s5  = b[4+idx];
15804e2b4712SSatish Balay     while (nz--) {
15814e2b4712SSatish Balay       idx   = 5*(*vi++);
1582f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1583f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1584f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1585f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1586f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1587f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1588f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
15894e2b4712SSatish Balay       v += 25;
15904e2b4712SSatish Balay     }
15914e2b4712SSatish Balay     idx = 5*i;
1592f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1593f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
15944e2b4712SSatish Balay   }
15954e2b4712SSatish Balay   /* backward solve the upper triangular */
15964e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15974e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
15984e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
15994e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
16004e2b4712SSatish Balay     idt  = 5*i;
1601f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1602f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
16034e2b4712SSatish Balay     while (nz--) {
16044e2b4712SSatish Balay       idx   = 5*(*vi++);
1605f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1606f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1607f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1608f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1609f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1610f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1611f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16124e2b4712SSatish Balay       v += 25;
16134e2b4712SSatish Balay     }
16144e2b4712SSatish Balay     idc = 5*(*c--);
16154e2b4712SSatish Balay     v   = aa + 25*diag[i];
1616f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1617f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1618f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1619f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1620f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1621f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1622f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1623f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1624f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1625f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16264e2b4712SSatish Balay   }
16274e2b4712SSatish Balay 
16284e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16294e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1630d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16311ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1632dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
16334e2b4712SSatish Balay   PetscFunctionReturn(0);
16344e2b4712SSatish Balay }
16354e2b4712SSatish Balay 
163684a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
163784a281e5SHong Zhang {
163884a281e5SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
163984a281e5SHong Zhang   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
164084a281e5SHong Zhang   PetscErrorCode    ierr;
164184a281e5SHong Zhang   PetscInt          jdx;
164284a281e5SHong Zhang   const MatScalar   *aa=a->a,*v;
164384a281e5SHong Zhang   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
164484a281e5SHong Zhang   const PetscScalar *b;
164584a281e5SHong Zhang 
164684a281e5SHong Zhang   PetscFunctionBegin;
164784a281e5SHong Zhang   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
164884a281e5SHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
164984a281e5SHong Zhang   /* forward solve the lower triangular */
165084a281e5SHong Zhang   idx    = 0;
165184a281e5SHong Zhang   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
165284a281e5SHong Zhang   for (i=1; i<n; i++) {
165384a281e5SHong Zhang     v   = aa + 25*ai[i];
165484a281e5SHong Zhang     vi  = aj + ai[i];
165584a281e5SHong Zhang     nz  = ai[i+1] - ai[i];
165684a281e5SHong Zhang     idx = 5*i;
165784a281e5SHong Zhang     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
165884a281e5SHong Zhang     while (nz--) {
165984a281e5SHong Zhang       jdx   = 5*(*vi++);
166084a281e5SHong Zhang       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
166184a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
166284a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
166384a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
166484a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
166584a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
166684a281e5SHong Zhang       v    += 25;
166784a281e5SHong Zhang     }
166884a281e5SHong Zhang     x[idx]   = s1;
166984a281e5SHong Zhang     x[1+idx] = s2;
167084a281e5SHong Zhang     x[2+idx] = s3;
167184a281e5SHong Zhang     x[3+idx] = s4;
167284a281e5SHong Zhang     x[4+idx] = s5;
167384a281e5SHong Zhang   }
167484a281e5SHong Zhang 
167584a281e5SHong Zhang   /* backward solve the upper triangular */
167684a281e5SHong Zhang   for (i=n-1; i>=0; i--){
167784a281e5SHong Zhang     v   = aa + 25*ai[2*n-i];
167884a281e5SHong Zhang     vi  = aj + ai[2*n-i];
167984a281e5SHong Zhang     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
168084a281e5SHong Zhang     idt = 5*i;
168184a281e5SHong Zhang     s1 = x[idt];  s2 = x[1+idt];
168284a281e5SHong Zhang     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
168384a281e5SHong Zhang     while (nz--) {
168484a281e5SHong Zhang       idx   = 5*(*vi++);
168584a281e5SHong Zhang       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
168684a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
168784a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
168884a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
168984a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
169084a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
169184a281e5SHong Zhang       v    += 25;
169284a281e5SHong Zhang     }
169384a281e5SHong Zhang     /* x = inv_diagonal*x */
169484a281e5SHong Zhang     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
169584a281e5SHong Zhang     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
169684a281e5SHong Zhang     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
169784a281e5SHong Zhang     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
169884a281e5SHong Zhang     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
169984a281e5SHong Zhang   }
170084a281e5SHong Zhang 
170184a281e5SHong Zhang   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
170284a281e5SHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
170384a281e5SHong Zhang   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
170484a281e5SHong Zhang   PetscFunctionReturn(0);
170584a281e5SHong Zhang }
170684a281e5SHong Zhang 
17074a2ae208SSatish Balay #undef __FUNCT__
17084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1709dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
171015091d37SBarry Smith {
171115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1712690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1713dfbe8321SBarry Smith   PetscErrorCode    ierr;
1714690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1715d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1716d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1717d9fead3dSBarry Smith   const PetscScalar *b;
171815091d37SBarry Smith 
171915091d37SBarry Smith   PetscFunctionBegin;
1720d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
172215091d37SBarry Smith   /* forward solve the lower triangular */
172315091d37SBarry Smith   idx    = 0;
172415091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
172515091d37SBarry Smith   for (i=1; i<n; i++) {
172615091d37SBarry Smith     v     =  aa + 25*ai[i];
172715091d37SBarry Smith     vi    =  aj + ai[i];
172815091d37SBarry Smith     nz    =  diag[i] - ai[i];
172915091d37SBarry Smith     idx   =  5*i;
1730f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
173115091d37SBarry Smith     while (nz--) {
173215091d37SBarry Smith       jdx   = 5*(*vi++);
173315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1734f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1735f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1736f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1737f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1738f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
173915091d37SBarry Smith       v    += 25;
174015091d37SBarry Smith     }
1741f1af5d2fSBarry Smith     x[idx]   = s1;
1742f1af5d2fSBarry Smith     x[1+idx] = s2;
1743f1af5d2fSBarry Smith     x[2+idx] = s3;
1744f1af5d2fSBarry Smith     x[3+idx] = s4;
1745f1af5d2fSBarry Smith     x[4+idx] = s5;
174615091d37SBarry Smith   }
174715091d37SBarry Smith   /* backward solve the upper triangular */
174815091d37SBarry Smith   for (i=n-1; i>=0; i--){
174915091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
175015091d37SBarry Smith     vi   = aj + diag[i] + 1;
175115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
175215091d37SBarry Smith     idt  = 5*i;
1753f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1754f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
175515091d37SBarry Smith     while (nz--) {
175615091d37SBarry Smith       idx   = 5*(*vi++);
175715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1758f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1759f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1760f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1761f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1762f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
176315091d37SBarry Smith       v    += 25;
176415091d37SBarry Smith     }
176515091d37SBarry Smith     v        = aa + 25*diag[i];
1766f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1767f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1768f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1769f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1770f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
177115091d37SBarry Smith   }
177215091d37SBarry Smith 
1773d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1775dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
177615091d37SBarry Smith   PetscFunctionReturn(0);
177715091d37SBarry Smith }
177815091d37SBarry Smith 
17794a2ae208SSatish Balay #undef __FUNCT__
17804a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1781dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
17824e2b4712SSatish Balay {
17834e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
17844e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
17856849ba73SBarry Smith   PetscErrorCode    ierr;
17865d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
17875d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1788d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1789d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1790d9fead3dSBarry Smith   const PetscScalar *b;
17914e2b4712SSatish Balay 
17924e2b4712SSatish Balay   PetscFunctionBegin;
1793d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1795f1af5d2fSBarry Smith   t  = a->solve_work;
17964e2b4712SSatish Balay 
17974e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17984e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17994e2b4712SSatish Balay 
18004e2b4712SSatish Balay   /* forward solve the lower triangular */
18014e2b4712SSatish Balay   idx    = 4*(*r++);
1802f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1803f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
18044e2b4712SSatish Balay   for (i=1; i<n; i++) {
18054e2b4712SSatish Balay     v     = aa + 16*ai[i];
18064e2b4712SSatish Balay     vi    = aj + ai[i];
18074e2b4712SSatish Balay     nz    = diag[i] - ai[i];
18084e2b4712SSatish Balay     idx   = 4*(*r++);
1809f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
18104e2b4712SSatish Balay     while (nz--) {
18114e2b4712SSatish Balay       idx   = 4*(*vi++);
1812f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1813f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1814f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1815f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1816f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
18174e2b4712SSatish Balay       v    += 16;
18184e2b4712SSatish Balay     }
18194e2b4712SSatish Balay     idx        = 4*i;
1820f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1821f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
18224e2b4712SSatish Balay   }
18234e2b4712SSatish Balay   /* backward solve the upper triangular */
18244e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
18254e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
18264e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
18274e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
18284e2b4712SSatish Balay     idt  = 4*i;
1829f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1830f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
18314e2b4712SSatish Balay     while (nz--) {
18324e2b4712SSatish Balay       idx   = 4*(*vi++);
1833f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1834f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1835f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1836f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1837f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1838f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
18394e2b4712SSatish Balay       v += 16;
18404e2b4712SSatish Balay     }
18414e2b4712SSatish Balay     idc      = 4*(*c--);
18424e2b4712SSatish Balay     v        = aa + 16*diag[i];
1843f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1844f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1845f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1846f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
18474e2b4712SSatish Balay   }
18484e2b4712SSatish Balay 
18494e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18504e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1851d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18521ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1853dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
18544e2b4712SSatish Balay   PetscFunctionReturn(0);
18554e2b4712SSatish Balay }
1856f26ec98cSKris Buschelman 
1857f26ec98cSKris Buschelman #undef __FUNCT__
1858f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1859dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1860f26ec98cSKris Buschelman {
1861f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1862f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
18636849ba73SBarry Smith   PetscErrorCode    ierr;
18645d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
18655d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1866d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1867d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1868d9fead3dSBarry Smith   PetscScalar       *x;
1869d9fead3dSBarry Smith   const PetscScalar *b;
1870f26ec98cSKris Buschelman 
1871f26ec98cSKris Buschelman   PetscFunctionBegin;
1872d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1874f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1875f26ec98cSKris Buschelman 
1876f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1877f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1878f26ec98cSKris Buschelman 
1879f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1880f26ec98cSKris Buschelman   idx    = 4*(*r++);
1881f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1882f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1883f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1884f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1885f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1886f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1887f26ec98cSKris Buschelman     vi    = aj + ai[i];
1888f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1889f26ec98cSKris Buschelman     idx   = 4*(*r++);
1890f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1891f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1892f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1893f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1894f26ec98cSKris Buschelman     while (nz--) {
1895f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1896f26ec98cSKris Buschelman       x1  = t[idx];
1897f26ec98cSKris Buschelman       x2  = t[1+idx];
1898f26ec98cSKris Buschelman       x3  = t[2+idx];
1899f26ec98cSKris Buschelman       x4  = t[3+idx];
1900f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1901f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1902f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1903f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1904f26ec98cSKris Buschelman       v    += 16;
1905f26ec98cSKris Buschelman     }
1906f26ec98cSKris Buschelman     idx        = 4*i;
1907f26ec98cSKris Buschelman     t[idx]   = s1;
1908f26ec98cSKris Buschelman     t[1+idx] = s2;
1909f26ec98cSKris Buschelman     t[2+idx] = s3;
1910f26ec98cSKris Buschelman     t[3+idx] = s4;
1911f26ec98cSKris Buschelman   }
1912f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1913f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1914f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1915f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1916f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1917f26ec98cSKris Buschelman     idt  = 4*i;
1918f26ec98cSKris Buschelman     s1 = t[idt];
1919f26ec98cSKris Buschelman     s2 = t[1+idt];
1920f26ec98cSKris Buschelman     s3 = t[2+idt];
1921f26ec98cSKris Buschelman     s4 = t[3+idt];
1922f26ec98cSKris Buschelman     while (nz--) {
1923f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1924f26ec98cSKris Buschelman       x1  = t[idx];
1925f26ec98cSKris Buschelman       x2  = t[1+idx];
1926f26ec98cSKris Buschelman       x3  = t[2+idx];
1927f26ec98cSKris Buschelman       x4  = t[3+idx];
1928f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1929f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1930f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1931f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1932f26ec98cSKris Buschelman       v += 16;
1933f26ec98cSKris Buschelman     }
1934f26ec98cSKris Buschelman     idc      = 4*(*c--);
1935f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1936f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1937f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1938f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1939f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1940f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
1941f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
1942f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
1943f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
1944f26ec98cSKris Buschelman  }
1945f26ec98cSKris Buschelman 
1946f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1948d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1951f26ec98cSKris Buschelman   PetscFunctionReturn(0);
1952f26ec98cSKris Buschelman }
1953f26ec98cSKris Buschelman 
195424c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
195524c233c2SKris Buschelman 
195624c233c2SKris Buschelman #include PETSC_HAVE_SSE
195724c233c2SKris Buschelman 
195824c233c2SKris Buschelman #undef __FUNCT__
195924c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
1960dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
196124c233c2SKris Buschelman {
196224c233c2SKris Buschelman   /*
196324c233c2SKris Buschelman      Note: This code uses demotion of double
196424c233c2SKris Buschelman      to float when performing the mixed-mode computation.
196524c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
196624c233c2SKris Buschelman   */
196724c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
196824c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
19696849ba73SBarry Smith   PetscErrorCode ierr;
19705d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
19715d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
197224c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
197387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
197424c233c2SKris Buschelman 
197524c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
197624c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
197724c233c2SKris Buschelman   unsigned long   offset;
197824c233c2SKris Buschelman 
197924c233c2SKris Buschelman   PetscFunctionBegin;
198024c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
198124c233c2SKris Buschelman 
198224c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
198324c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
198424c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
198524c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
198624c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
198724c233c2SKris Buschelman 
19881ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
19891ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
199024c233c2SKris Buschelman     t  = a->solve_work;
199124c233c2SKris Buschelman 
199224c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
199324c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
199424c233c2SKris Buschelman 
199524c233c2SKris Buschelman     /* forward solve the lower triangular */
199624c233c2SKris Buschelman     idx  = 4*(*r++);
199724c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
199824c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
199924c233c2SKris Buschelman     v    =  aa + 16*ai[1];
200024c233c2SKris Buschelman 
200124c233c2SKris Buschelman     for (i=1; i<n;) {
200224c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
200324c233c2SKris Buschelman       vi   =  aj      + ai[i];
200424c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
200524c233c2SKris Buschelman       idx  =  4*(*r++);
200624c233c2SKris Buschelman 
200724c233c2SKris Buschelman       /* Demote sum from double to float */
200824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
200924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
201024c233c2SKris Buschelman 
201124c233c2SKris Buschelman       while (nz--) {
201224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
201324c233c2SKris Buschelman         idx = 4*(*vi++);
201424c233c2SKris Buschelman 
201524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
201624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
201724c233c2SKris Buschelman 
201824c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
201924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
202024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
202124c233c2SKris Buschelman 
202224c233c2SKris Buschelman           /* First Column */
202324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
202424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
202524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
202624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
202724c233c2SKris Buschelman 
202824c233c2SKris Buschelman           /* Second Column */
202924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
203024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
203124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
203224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
203324c233c2SKris Buschelman 
203424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
203524c233c2SKris Buschelman 
203624c233c2SKris Buschelman           /* Third Column */
203724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
203824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
203924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
204024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
204124c233c2SKris Buschelman 
204224c233c2SKris Buschelman           /* Fourth Column */
204324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
204424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
204524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
204624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
204724c233c2SKris Buschelman         SSE_INLINE_END_2
204824c233c2SKris Buschelman 
204924c233c2SKris Buschelman         v  += 16;
205024c233c2SKris Buschelman       }
205124c233c2SKris Buschelman       idx = 4*i;
205224c233c2SKris Buschelman       v   = aa + 16*ai[++i];
205324c233c2SKris Buschelman       PREFETCH_NTA(v);
205424c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
205524c233c2SKris Buschelman 
205624c233c2SKris Buschelman       /* Promote result from float to double */
205724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
205824c233c2SKris Buschelman     }
205924c233c2SKris Buschelman     /* backward solve the upper triangular */
206024c233c2SKris Buschelman     idt  = 4*(n-1);
206124c233c2SKris Buschelman     ai16 = 16*diag[n-1];
206224c233c2SKris Buschelman     v    = aa + ai16 + 16;
206324c233c2SKris Buschelman     for (i=n-1; i>=0;){
206424c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
206524c233c2SKris Buschelman       vi = aj + diag[i] + 1;
206624c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
206724c233c2SKris Buschelman 
206824c233c2SKris Buschelman       /* Demote accumulator from double to float */
206924c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
207024c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
207124c233c2SKris Buschelman 
207224c233c2SKris Buschelman       while (nz--) {
207324c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
207424c233c2SKris Buschelman         idx = 4*(*vi++);
207524c233c2SKris Buschelman 
207624c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
207724c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
207824c233c2SKris Buschelman 
207924c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
208024c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
208124c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
208224c233c2SKris Buschelman 
208324c233c2SKris Buschelman           /* First Column */
208424c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
208524c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
208624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
208724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
208824c233c2SKris Buschelman 
208924c233c2SKris Buschelman           /* Second Column */
209024c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
209124c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
209224c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
209324c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
209424c233c2SKris Buschelman 
209524c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
209624c233c2SKris Buschelman 
209724c233c2SKris Buschelman           /* Third Column */
209824c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
209924c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
210024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
210124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
210224c233c2SKris Buschelman 
210324c233c2SKris Buschelman           /* Fourth Column */
210424c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
210524c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
210624c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
210724c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
210824c233c2SKris Buschelman         SSE_INLINE_END_2
210924c233c2SKris Buschelman         v  += 16;
211024c233c2SKris Buschelman       }
211124c233c2SKris Buschelman       v    = aa + ai16;
211224c233c2SKris Buschelman       ai16 = 16*diag[--i];
211324c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
211424c233c2SKris Buschelman       /*
211524c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
211624c233c2SKris Buschelman          which was inverted as part of the factorization
211724c233c2SKris Buschelman       */
211824c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
211924c233c2SKris Buschelman         /* First Column */
212024c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
212124c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
212224c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
212324c233c2SKris Buschelman 
212424c233c2SKris Buschelman         /* Second Column */
212524c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
212624c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
212724c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
212824c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
212924c233c2SKris Buschelman 
213024c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
213124c233c2SKris Buschelman 
213224c233c2SKris Buschelman         /* Third Column */
213324c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
213424c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
213524c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
213624c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
213724c233c2SKris Buschelman 
213824c233c2SKris Buschelman         /* Fourth Column */
213924c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
214024c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
214124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
214224c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
214324c233c2SKris Buschelman 
214424c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
214524c233c2SKris Buschelman       SSE_INLINE_END_3
214624c233c2SKris Buschelman 
214724c233c2SKris Buschelman       /* Promote solution from float to double */
214824c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
214924c233c2SKris Buschelman 
215024c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
215124c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
215224c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
215324c233c2SKris Buschelman       idc  = 4*(*c--);
215424c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
215524c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
215624c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
215724c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
215824c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
215924c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
216024c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
216124c233c2SKris Buschelman       SSE_INLINE_END_2
216224c233c2SKris Buschelman       v    = aa + ai16 + 16;
216324c233c2SKris Buschelman       idt -= 4;
216424c233c2SKris Buschelman     }
216524c233c2SKris Buschelman 
216624c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
216724c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21681ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
21691ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2170dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
217124c233c2SKris Buschelman   SSE_SCOPE_END;
217224c233c2SKris Buschelman   PetscFunctionReturn(0);
217324c233c2SKris Buschelman }
217424c233c2SKris Buschelman 
217524c233c2SKris Buschelman #endif
21760ef38995SBarry Smith 
21770ef38995SBarry Smith 
21784e2b4712SSatish Balay /*
21794e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
21804e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
21814e2b4712SSatish Balay */
21824a2ae208SSatish Balay #undef __FUNCT__
21834a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2184dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
21854e2b4712SSatish Balay {
21864e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2187356650c2SBarry Smith   PetscInt          n=a->mbs;
2188356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2189dfbe8321SBarry Smith   PetscErrorCode    ierr;
2190356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2191d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2192d9fead3dSBarry Smith   PetscScalar       *x;
2193d9fead3dSBarry Smith   const PetscScalar *b;
21944e2b4712SSatish Balay 
21954e2b4712SSatish Balay   PetscFunctionBegin;
2196d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21971ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21984e2b4712SSatish Balay 
2199aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
22002853dc0eSBarry Smith   {
220187828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22022853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
22032853dc0eSBarry Smith   }
2204aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
22052853dc0eSBarry Smith   {
220687828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22072853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
22082853dc0eSBarry Smith   }
2209aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
22102853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2211e1293385SBarry Smith #else
221230d4dcafSBarry Smith   {
221387828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2214d9fead3dSBarry Smith     const MatScalar *v;
2215356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2216356650c2SBarry Smith     const PetscInt  *vi;
2217e1293385SBarry Smith 
22184e2b4712SSatish Balay   /* forward solve the lower triangular */
22194e2b4712SSatish Balay   idx    = 0;
2220e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
22214e2b4712SSatish Balay   for (i=1; i<n; i++) {
22224e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
22234e2b4712SSatish Balay     vi    =  aj      + ai[i];
22244e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2225e1293385SBarry Smith     idx   +=  4;
2226f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22274e2b4712SSatish Balay     while (nz--) {
22284e2b4712SSatish Balay       jdx   = 4*(*vi++);
22294e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2230f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2231f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2232f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2233f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
22344e2b4712SSatish Balay       v    += 16;
22354e2b4712SSatish Balay     }
2236f1af5d2fSBarry Smith     x[idx]   = s1;
2237f1af5d2fSBarry Smith     x[1+idx] = s2;
2238f1af5d2fSBarry Smith     x[2+idx] = s3;
2239f1af5d2fSBarry Smith     x[3+idx] = s4;
22404e2b4712SSatish Balay   }
22414e2b4712SSatish Balay   /* backward solve the upper triangular */
22424e555682SBarry Smith   idt = 4*(n-1);
22434e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
22444e555682SBarry Smith     ai16 = 16*diag[i];
22454e555682SBarry Smith     v    = aa + ai16 + 16;
22464e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
22474e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2248f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2249f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
22504e2b4712SSatish Balay     while (nz--) {
22514e2b4712SSatish Balay       idx   = 4*(*vi++);
22524e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2253f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2254f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2255f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2256f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
22574e2b4712SSatish Balay       v    += 16;
22584e2b4712SSatish Balay     }
22594e555682SBarry Smith     v        = aa + ai16;
2260f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2261f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2262f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2263f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2264329f5518SBarry Smith     idt -= 4;
22654e2b4712SSatish Balay   }
226630d4dcafSBarry Smith   }
2267e1293385SBarry Smith #endif
22684e2b4712SSatish Balay 
2269d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2271dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
22724e2b4712SSatish Balay   PetscFunctionReturn(0);
22734e2b4712SSatish Balay }
22744e2b4712SSatish Balay 
2275f26ec98cSKris Buschelman #undef __FUNCT__
2276f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2277dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2278f26ec98cSKris Buschelman {
2279f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2280690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2281dfbe8321SBarry Smith   PetscErrorCode ierr;
2282690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2283f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2284f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2285f26ec98cSKris Buschelman 
2286f26ec98cSKris Buschelman   PetscFunctionBegin;
22871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2289f26ec98cSKris Buschelman 
2290f26ec98cSKris Buschelman   {
2291f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2292f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2293690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2294f26ec98cSKris Buschelman 
2295f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2296f26ec98cSKris Buschelman     idx  = 0;
2297f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2298f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2299f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2300f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2301f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2302f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2303f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2304f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2305f26ec98cSKris Buschelman       idx   +=  4;
2306f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2307f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2308f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2309f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2310f26ec98cSKris Buschelman       while (nz--) {
2311f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2312f26ec98cSKris Buschelman         x1  = t[jdx];
2313f26ec98cSKris Buschelman         x2  = t[1+jdx];
2314f26ec98cSKris Buschelman         x3  = t[2+jdx];
2315f26ec98cSKris Buschelman         x4  = t[3+jdx];
2316f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2317f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2318f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2319f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2320f26ec98cSKris Buschelman         v    += 16;
2321f26ec98cSKris Buschelman       }
2322f26ec98cSKris Buschelman       t[idx]   = s1;
2323f26ec98cSKris Buschelman       t[1+idx] = s2;
2324f26ec98cSKris Buschelman       t[2+idx] = s3;
2325f26ec98cSKris Buschelman       t[3+idx] = s4;
2326f26ec98cSKris Buschelman     }
2327f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2328f26ec98cSKris Buschelman     idt = 4*(n-1);
2329f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2330f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2331f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2332f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2333f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2334f26ec98cSKris Buschelman       s1   = t[idt];
2335f26ec98cSKris Buschelman       s2   = t[1+idt];
2336f26ec98cSKris Buschelman       s3   = t[2+idt];
2337f26ec98cSKris Buschelman       s4   = t[3+idt];
2338f26ec98cSKris Buschelman       while (nz--) {
2339f26ec98cSKris Buschelman         idx = 4*(*vi++);
2340f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2341f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2342f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2343f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2344f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2345f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2346f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2347f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2348f26ec98cSKris Buschelman         v    += 16;
2349f26ec98cSKris Buschelman       }
2350f26ec98cSKris Buschelman       v        = aa + ai16;
2351f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2352f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2353f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2354f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2355f26ec98cSKris Buschelman       idt -= 4;
2356f26ec98cSKris Buschelman     }
2357f26ec98cSKris Buschelman   }
2358f26ec98cSKris Buschelman 
23591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
23601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2361dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2362f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2363f26ec98cSKris Buschelman }
2364f26ec98cSKris Buschelman 
23653660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
23663660e330SKris Buschelman 
23673660e330SKris Buschelman #include PETSC_HAVE_SSE
23683660e330SKris Buschelman #undef __FUNCT__
23697cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2370dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
23713660e330SKris Buschelman {
23723660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
23732aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2374dfbe8321SBarry Smith   PetscErrorCode ierr;
2375dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
23763660e330SKris Buschelman   MatScalar      *aa=a->a;
237787828ca2SBarry Smith   PetscScalar    *x,*b;
23783660e330SKris Buschelman 
23793660e330SKris Buschelman   PetscFunctionBegin;
23803660e330SKris Buschelman   SSE_SCOPE_BEGIN;
23813660e330SKris Buschelman   /*
23823660e330SKris Buschelman      Note: This code currently uses demotion of double
23833660e330SKris Buschelman      to float when performing the mixed-mode computation.
23843660e330SKris Buschelman      This may not be numerically reasonable for all applications.
23853660e330SKris Buschelman   */
23863660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
23873660e330SKris Buschelman 
23881ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23891ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23903660e330SKris Buschelman   {
2391eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2392eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
23932aa5897fSKris Buschelman     int            nz,i,idt,ai16;
23942aa5897fSKris Buschelman     unsigned int   jdx,idx;
23952aa5897fSKris Buschelman     unsigned short *vi;
2396eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
23973660e330SKris Buschelman 
2398eb05f457SKris Buschelman     /* First block is the identity. */
23993660e330SKris Buschelman     idx  = 0;
2400eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
24012aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
24023660e330SKris Buschelman 
24033660e330SKris Buschelman     for (i=1; i<n;) {
24043660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
24053660e330SKris Buschelman       vi   =  aj      + ai[i];
24063660e330SKris Buschelman       nz   =  diag[i] - ai[i];
24073660e330SKris Buschelman       idx +=  4;
24083660e330SKris Buschelman 
2409eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2410eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2411eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
24123660e330SKris Buschelman 
24133660e330SKris Buschelman       while (nz--) {
24143660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
24152aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
24163660e330SKris Buschelman 
24173660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2418eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
24193660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24203660e330SKris Buschelman 
24213660e330SKris Buschelman           /* First Column */
24223660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24233660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24243660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24253660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24263660e330SKris Buschelman 
24273660e330SKris Buschelman           /* Second Column */
24283660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24293660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24303660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24313660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24323660e330SKris Buschelman 
24333660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24343660e330SKris Buschelman 
24353660e330SKris Buschelman           /* Third Column */
24363660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24373660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24383660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24403660e330SKris Buschelman 
24413660e330SKris Buschelman           /* Fourth Column */
24423660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24433660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24443660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24453660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
24463660e330SKris Buschelman         SSE_INLINE_END_2
24473660e330SKris Buschelman 
24483660e330SKris Buschelman         v  += 16;
24493660e330SKris Buschelman       }
24503660e330SKris Buschelman       v    =  aa + 16*ai[++i];
24513660e330SKris Buschelman       PREFETCH_NTA(v);
2452eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
24533660e330SKris Buschelman     }
2454eb05f457SKris Buschelman 
2455eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2456eb05f457SKris Buschelman 
24573660e330SKris Buschelman     idt  = 4*(n-1);
24583660e330SKris Buschelman     ai16 = 16*diag[n-1];
24593660e330SKris Buschelman     v    = aa + ai16 + 16;
24603660e330SKris Buschelman     for (i=n-1; i>=0;){
24613660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
24623660e330SKris Buschelman       vi = aj + diag[i] + 1;
24633660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
24643660e330SKris Buschelman 
2465eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
24663660e330SKris Buschelman 
24673660e330SKris Buschelman       while (nz--) {
24683660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
24692aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
24703660e330SKris Buschelman 
24713660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2472eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
24733660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24743660e330SKris Buschelman 
24753660e330SKris Buschelman           /* First Column */
24763660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24773660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24783660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24803660e330SKris Buschelman 
24813660e330SKris Buschelman           /* Second Column */
24823660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24833660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24843660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24863660e330SKris Buschelman 
24873660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24883660e330SKris Buschelman 
24893660e330SKris Buschelman           /* Third Column */
24903660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24913660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24923660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24933660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24943660e330SKris Buschelman 
24953660e330SKris Buschelman           /* Fourth Column */
24963660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
24973660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
24983660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
24993660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25003660e330SKris Buschelman         SSE_INLINE_END_2
25013660e330SKris Buschelman         v  += 16;
25023660e330SKris Buschelman       }
25033660e330SKris Buschelman       v    = aa + ai16;
25043660e330SKris Buschelman       ai16 = 16*diag[--i];
25053660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
25063660e330SKris Buschelman       /*
25073660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
25083660e330SKris Buschelman          which was inverted as part of the factorization
25093660e330SKris Buschelman       */
2510eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
25113660e330SKris Buschelman         /* First Column */
25123660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
25133660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
25143660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
25153660e330SKris Buschelman 
25163660e330SKris Buschelman         /* Second Column */
25173660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
25183660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
25193660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
25203660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
25213660e330SKris Buschelman 
25223660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
25233660e330SKris Buschelman 
25243660e330SKris Buschelman         /* Third Column */
25253660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
25263660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
25273660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
25283660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
25293660e330SKris Buschelman 
25303660e330SKris Buschelman         /* Fourth Column */
25313660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
25323660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
25333660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
25343660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
25353660e330SKris Buschelman 
25363660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
25373660e330SKris Buschelman       SSE_INLINE_END_3
25383660e330SKris Buschelman 
25393660e330SKris Buschelman       v    = aa + ai16 + 16;
25403660e330SKris Buschelman       idt -= 4;
25413660e330SKris Buschelman     }
2542eb05f457SKris Buschelman 
2543eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2544eb05f457SKris Buschelman     idt = 4*(n-1);
2545eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2546eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2547eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2548eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2549eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2550eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2551eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2552eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2553eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
255454693613SKris Buschelman       idt -= 4;
25553660e330SKris Buschelman     }
2556eb05f457SKris Buschelman 
2557eb05f457SKris Buschelman   } /* End of artificial scope. */
25581ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
25591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2560dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
25613660e330SKris Buschelman   SSE_SCOPE_END;
25623660e330SKris Buschelman   PetscFunctionReturn(0);
25633660e330SKris Buschelman }
25643660e330SKris Buschelman 
25657cf1b8d3SKris Buschelman #undef __FUNCT__
25667cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2567dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
25687cf1b8d3SKris Buschelman {
25697cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
25707cf1b8d3SKris Buschelman   int            *aj=a->j;
2571dfbe8321SBarry Smith   PetscErrorCode ierr;
2572dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
25737cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
25747cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
25757cf1b8d3SKris Buschelman 
25767cf1b8d3SKris Buschelman   PetscFunctionBegin;
25777cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
25787cf1b8d3SKris Buschelman   /*
25797cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
25807cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
25817cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
25827cf1b8d3SKris Buschelman   */
25837cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
25847cf1b8d3SKris Buschelman 
25851ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
25861ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
25877cf1b8d3SKris Buschelman   {
25887cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
25897cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
25907cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
25917cf1b8d3SKris Buschelman     int       jdx,idx;
25927cf1b8d3SKris Buschelman     int       *vi;
25937cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
25947cf1b8d3SKris Buschelman 
25957cf1b8d3SKris Buschelman     /* First block is the identity. */
25967cf1b8d3SKris Buschelman     idx  = 0;
25977cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
25987cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
25997cf1b8d3SKris Buschelman 
26007cf1b8d3SKris Buschelman     for (i=1; i<n;) {
26017cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
26027cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
26037cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
26047cf1b8d3SKris Buschelman       idx +=  4;
26057cf1b8d3SKris Buschelman 
26067cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
26077cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
26087cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
26097cf1b8d3SKris Buschelman 
26107cf1b8d3SKris Buschelman       while (nz--) {
26117cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
26127cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
26137cf1b8d3SKris Buschelman /*          jdx = *vi++; */
26147cf1b8d3SKris Buschelman 
26157cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
26167cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
26177cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26187cf1b8d3SKris Buschelman 
26197cf1b8d3SKris Buschelman           /* First Column */
26207cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26217cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26227cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26237cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26247cf1b8d3SKris Buschelman 
26257cf1b8d3SKris Buschelman           /* Second Column */
26267cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26277cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26287cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26297cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26307cf1b8d3SKris Buschelman 
26317cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26327cf1b8d3SKris Buschelman 
26337cf1b8d3SKris Buschelman           /* Third Column */
26347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26387cf1b8d3SKris Buschelman 
26397cf1b8d3SKris Buschelman           /* Fourth Column */
26407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26447cf1b8d3SKris Buschelman         SSE_INLINE_END_2
26457cf1b8d3SKris Buschelman 
26467cf1b8d3SKris Buschelman         v  += 16;
26477cf1b8d3SKris Buschelman       }
26487cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
26497cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
26507cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
26517cf1b8d3SKris Buschelman     }
26527cf1b8d3SKris Buschelman 
26537cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
26547cf1b8d3SKris Buschelman 
26557cf1b8d3SKris Buschelman     idt  = 4*(n-1);
26567cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
26577cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
26587cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
26597cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
26607cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
26617cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
26627cf1b8d3SKris Buschelman 
26637cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
26647cf1b8d3SKris Buschelman 
26657cf1b8d3SKris Buschelman       while (nz--) {
26667cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
26677cf1b8d3SKris Buschelman         idx = 4*(*vi++);
26687cf1b8d3SKris Buschelman /*          idx = *vi++; */
26697cf1b8d3SKris Buschelman 
26707cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
26717cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
26727cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26737cf1b8d3SKris Buschelman 
26747cf1b8d3SKris Buschelman           /* First Column */
26757cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26767cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26777cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26787cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26797cf1b8d3SKris Buschelman 
26807cf1b8d3SKris Buschelman           /* Second Column */
26817cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26827cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26837cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26847cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26857cf1b8d3SKris Buschelman 
26867cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26877cf1b8d3SKris Buschelman 
26887cf1b8d3SKris Buschelman           /* Third Column */
26897cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26907cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26917cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26927cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26937cf1b8d3SKris Buschelman 
26947cf1b8d3SKris Buschelman           /* Fourth Column */
26957cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
26967cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
26977cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
26987cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
26997cf1b8d3SKris Buschelman         SSE_INLINE_END_2
27007cf1b8d3SKris Buschelman         v  += 16;
27017cf1b8d3SKris Buschelman       }
27027cf1b8d3SKris Buschelman       v    = aa + ai16;
27037cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
27047cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
27057cf1b8d3SKris Buschelman       /*
27067cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
27077cf1b8d3SKris Buschelman          which was inverted as part of the factorization
27087cf1b8d3SKris Buschelman       */
27097cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
27107cf1b8d3SKris Buschelman         /* First Column */
27117cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
27127cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
27137cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
27147cf1b8d3SKris Buschelman 
27157cf1b8d3SKris Buschelman         /* Second Column */
27167cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
27177cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
27187cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
27197cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
27207cf1b8d3SKris Buschelman 
27217cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
27227cf1b8d3SKris Buschelman 
27237cf1b8d3SKris Buschelman         /* Third Column */
27247cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
27257cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
27267cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
27277cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
27287cf1b8d3SKris Buschelman 
27297cf1b8d3SKris Buschelman         /* Fourth Column */
27307cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
27317cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
27327cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
27337cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
27347cf1b8d3SKris Buschelman 
27357cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
27367cf1b8d3SKris Buschelman       SSE_INLINE_END_3
27377cf1b8d3SKris Buschelman 
27387cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
27397cf1b8d3SKris Buschelman       idt -= 4;
27407cf1b8d3SKris Buschelman     }
27417cf1b8d3SKris Buschelman 
27427cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
27437cf1b8d3SKris Buschelman     idt = 4*(n-1);
27447cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
27457cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
27467cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
27477cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
27487cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
27497cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
27507cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
27517cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
27527cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
27537cf1b8d3SKris Buschelman       idt -= 4;
27547cf1b8d3SKris Buschelman     }
27557cf1b8d3SKris Buschelman 
27567cf1b8d3SKris Buschelman   } /* End of artificial scope. */
27571ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
27581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2759dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
27607cf1b8d3SKris Buschelman   SSE_SCOPE_END;
27617cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
27627cf1b8d3SKris Buschelman }
27637cf1b8d3SKris Buschelman 
27643660e330SKris Buschelman #endif
27654a2ae208SSatish Balay #undef __FUNCT__
27664a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2767dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
27684e2b4712SSatish Balay {
27694e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
27704e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
27716849ba73SBarry Smith   PetscErrorCode    ierr;
27725d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
27735d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2774d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2775d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2776d9fead3dSBarry Smith   const PetscScalar *b;
27774e2b4712SSatish Balay 
27784e2b4712SSatish Balay   PetscFunctionBegin;
2779d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2781f1af5d2fSBarry Smith   t  = a->solve_work;
27824e2b4712SSatish Balay 
27834e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27844e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
27854e2b4712SSatish Balay 
27864e2b4712SSatish Balay   /* forward solve the lower triangular */
27874e2b4712SSatish Balay   idx    = 3*(*r++);
2788f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
27894e2b4712SSatish Balay   for (i=1; i<n; i++) {
27904e2b4712SSatish Balay     v     = aa + 9*ai[i];
27914e2b4712SSatish Balay     vi    = aj + ai[i];
27924e2b4712SSatish Balay     nz    = diag[i] - ai[i];
27934e2b4712SSatish Balay     idx   = 3*(*r++);
2794f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
27954e2b4712SSatish Balay     while (nz--) {
27964e2b4712SSatish Balay       idx   = 3*(*vi++);
2797f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2798f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2799f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2800f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28014e2b4712SSatish Balay       v += 9;
28024e2b4712SSatish Balay     }
28034e2b4712SSatish Balay     idx = 3*i;
2804f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
28054e2b4712SSatish Balay   }
28064e2b4712SSatish Balay   /* backward solve the upper triangular */
28074e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28084e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
28094e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28104e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28114e2b4712SSatish Balay     idt  = 3*i;
2812f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
28134e2b4712SSatish Balay     while (nz--) {
28144e2b4712SSatish Balay       idx   = 3*(*vi++);
2815f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2816f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2817f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2818f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28194e2b4712SSatish Balay       v += 9;
28204e2b4712SSatish Balay     }
28214e2b4712SSatish Balay     idc = 3*(*c--);
28224e2b4712SSatish Balay     v   = aa + 9*diag[i];
2823f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2824f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2825f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
28264e2b4712SSatish Balay   }
28274e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28284e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2829d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2831dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
28324e2b4712SSatish Balay   PetscFunctionReturn(0);
28334e2b4712SSatish Balay }
28344e2b4712SSatish Balay 
283515091d37SBarry Smith /*
283615091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
283715091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
283815091d37SBarry Smith */
28394a2ae208SSatish Balay #undef __FUNCT__
28404a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2841dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
284215091d37SBarry Smith {
284315091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2844690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2845dfbe8321SBarry Smith   PetscErrorCode    ierr;
2846690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2847d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2848d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2849d9fead3dSBarry Smith   const PetscScalar *b;
2850690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
285115091d37SBarry Smith 
285215091d37SBarry Smith   PetscFunctionBegin;
2853d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28541ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
285515091d37SBarry Smith 
285615091d37SBarry Smith   /* forward solve the lower triangular */
285715091d37SBarry Smith   idx    = 0;
285815091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
285915091d37SBarry Smith   for (i=1; i<n; i++) {
286015091d37SBarry Smith     v     =  aa      + 9*ai[i];
286115091d37SBarry Smith     vi    =  aj      + ai[i];
286215091d37SBarry Smith     nz    =  diag[i] - ai[i];
286315091d37SBarry Smith     idx   +=  3;
2864f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
286515091d37SBarry Smith     while (nz--) {
286615091d37SBarry Smith       jdx   = 3*(*vi++);
286715091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2868f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2869f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2870f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
287115091d37SBarry Smith       v    += 9;
287215091d37SBarry Smith     }
2873f1af5d2fSBarry Smith     x[idx]   = s1;
2874f1af5d2fSBarry Smith     x[1+idx] = s2;
2875f1af5d2fSBarry Smith     x[2+idx] = s3;
287615091d37SBarry Smith   }
287715091d37SBarry Smith   /* backward solve the upper triangular */
287815091d37SBarry Smith   for (i=n-1; i>=0; i--){
287915091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
288015091d37SBarry Smith     vi   = aj + diag[i] + 1;
288115091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
288215091d37SBarry Smith     idt  = 3*i;
2883f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2884f1af5d2fSBarry Smith     s3 = x[2+idt];
288515091d37SBarry Smith     while (nz--) {
288615091d37SBarry Smith       idx   = 3*(*vi++);
288715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2888f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2889f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2890f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
289115091d37SBarry Smith       v    += 9;
289215091d37SBarry Smith     }
289315091d37SBarry Smith     v        = aa +  9*diag[i];
2894f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2895f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2896f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
289715091d37SBarry Smith   }
289815091d37SBarry Smith 
2899d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29001ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2901dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
290215091d37SBarry Smith   PetscFunctionReturn(0);
290315091d37SBarry Smith }
290415091d37SBarry Smith 
29054a2ae208SSatish Balay #undef __FUNCT__
29064a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2907dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
29084e2b4712SSatish Balay {
29094e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
29104e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
29116849ba73SBarry Smith   PetscErrorCode    ierr;
29125d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
29135d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2914d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2915d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
2916d9fead3dSBarry Smith   const PetscScalar *b;
29174e2b4712SSatish Balay 
29184e2b4712SSatish Balay   PetscFunctionBegin;
2919d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29201ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2921f1af5d2fSBarry Smith   t  = a->solve_work;
29224e2b4712SSatish Balay 
29234e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29244e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29254e2b4712SSatish Balay 
29264e2b4712SSatish Balay   /* forward solve the lower triangular */
29274e2b4712SSatish Balay   idx    = 2*(*r++);
2928f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
29294e2b4712SSatish Balay   for (i=1; i<n; i++) {
29304e2b4712SSatish Balay     v     = aa + 4*ai[i];
29314e2b4712SSatish Balay     vi    = aj + ai[i];
29324e2b4712SSatish Balay     nz    = diag[i] - ai[i];
29334e2b4712SSatish Balay     idx   = 2*(*r++);
2934f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
29354e2b4712SSatish Balay     while (nz--) {
29364e2b4712SSatish Balay       idx   = 2*(*vi++);
2937f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2938f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2939f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
29404e2b4712SSatish Balay       v += 4;
29414e2b4712SSatish Balay     }
29424e2b4712SSatish Balay     idx = 2*i;
2943f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
29444e2b4712SSatish Balay   }
29454e2b4712SSatish Balay   /* backward solve the upper triangular */
29464e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
29474e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
29484e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
29494e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
29504e2b4712SSatish Balay     idt  = 2*i;
2951f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
29524e2b4712SSatish Balay     while (nz--) {
29534e2b4712SSatish Balay       idx   = 2*(*vi++);
2954f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2955f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2956f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
29574e2b4712SSatish Balay       v += 4;
29584e2b4712SSatish Balay     }
29594e2b4712SSatish Balay     idc = 2*(*c--);
29604e2b4712SSatish Balay     v   = aa + 4*diag[i];
2961f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
2962f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
29634e2b4712SSatish Balay   }
29644e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
29654e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2966d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2968dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
29694e2b4712SSatish Balay   PetscFunctionReturn(0);
29704e2b4712SSatish Balay }
29714e2b4712SSatish Balay 
297215091d37SBarry Smith /*
297315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
297415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
297515091d37SBarry Smith */
29764a2ae208SSatish Balay #undef __FUNCT__
29774a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
2978dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
297915091d37SBarry Smith {
298015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2981690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2982dfbe8321SBarry Smith   PetscErrorCode    ierr;
2983690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2984d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2985d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
2986d9fead3dSBarry Smith   const PetscScalar *b;
2987690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
298815091d37SBarry Smith 
298915091d37SBarry Smith   PetscFunctionBegin;
2990d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
299215091d37SBarry Smith 
299315091d37SBarry Smith   /* forward solve the lower triangular */
299415091d37SBarry Smith   idx    = 0;
299515091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
299615091d37SBarry Smith   for (i=1; i<n; i++) {
299715091d37SBarry Smith     v     =  aa      + 4*ai[i];
299815091d37SBarry Smith     vi    =  aj      + ai[i];
299915091d37SBarry Smith     nz    =  diag[i] - ai[i];
300015091d37SBarry Smith     idx   +=  2;
3001f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
300215091d37SBarry Smith     while (nz--) {
300315091d37SBarry Smith       jdx   = 2*(*vi++);
300415091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3005f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3006f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
300715091d37SBarry Smith       v    += 4;
300815091d37SBarry Smith     }
3009f1af5d2fSBarry Smith     x[idx]   = s1;
3010f1af5d2fSBarry Smith     x[1+idx] = s2;
301115091d37SBarry Smith   }
301215091d37SBarry Smith   /* backward solve the upper triangular */
301315091d37SBarry Smith   for (i=n-1; i>=0; i--){
301415091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
301515091d37SBarry Smith     vi   = aj + diag[i] + 1;
301615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
301715091d37SBarry Smith     idt  = 2*i;
3018f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
301915091d37SBarry Smith     while (nz--) {
302015091d37SBarry Smith       idx   = 2*(*vi++);
302115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3022f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3023f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
302415091d37SBarry Smith       v    += 4;
302515091d37SBarry Smith     }
302615091d37SBarry Smith     v        = aa +  4*diag[i];
3027f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3028f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
302915091d37SBarry Smith   }
303015091d37SBarry Smith 
3031d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3033dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
303415091d37SBarry Smith   PetscFunctionReturn(0);
303515091d37SBarry Smith }
303615091d37SBarry Smith 
30374a2ae208SSatish Balay #undef __FUNCT__
30384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3039dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
30404e2b4712SSatish Balay {
30414e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
30424e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
30436849ba73SBarry Smith   PetscErrorCode ierr;
30445d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
30455d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
30463f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
304787828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
30484e2b4712SSatish Balay 
30494e2b4712SSatish Balay   PetscFunctionBegin;
30504e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
30514e2b4712SSatish Balay 
30521ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3054f1af5d2fSBarry Smith   t  = a->solve_work;
30554e2b4712SSatish Balay 
30564e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30574e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30584e2b4712SSatish Balay 
30594e2b4712SSatish Balay   /* forward solve the lower triangular */
3060f1af5d2fSBarry Smith   t[0] = b[*r++];
30614e2b4712SSatish Balay   for (i=1; i<n; i++) {
30624e2b4712SSatish Balay     v     = aa + ai[i];
30634e2b4712SSatish Balay     vi    = aj + ai[i];
30644e2b4712SSatish Balay     nz    = diag[i] - ai[i];
3065f1af5d2fSBarry Smith     s1  = b[*r++];
30664e2b4712SSatish Balay     while (nz--) {
3067f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30684e2b4712SSatish Balay     }
3069f1af5d2fSBarry Smith     t[i] = s1;
30704e2b4712SSatish Balay   }
30714e2b4712SSatish Balay   /* backward solve the upper triangular */
30724e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30734e2b4712SSatish Balay     v    = aa + diag[i] + 1;
30744e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30754e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3076f1af5d2fSBarry Smith     s1 = t[i];
30774e2b4712SSatish Balay     while (nz--) {
3078f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
30794e2b4712SSatish Balay     }
3080f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
30814e2b4712SSatish Balay   }
30824e2b4712SSatish Balay 
30834e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30844e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30851ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
30861ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3087dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
30884e2b4712SSatish Balay   PetscFunctionReturn(0);
30894e2b4712SSatish Balay }
309015091d37SBarry Smith /*
309115091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
309215091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
309315091d37SBarry Smith */
30944a2ae208SSatish Balay #undef __FUNCT__
30954a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3096dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
309715091d37SBarry Smith {
309815091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3099690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3100dfbe8321SBarry Smith   PetscErrorCode ierr;
3101690b6cddSBarry Smith   PetscInt       *diag = a->diag;
310215091d37SBarry Smith   MatScalar      *aa=a->a;
310387828ca2SBarry Smith   PetscScalar    *x,*b;
310487828ca2SBarry Smith   PetscScalar    s1,x1;
310515091d37SBarry Smith   MatScalar      *v;
3106690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
310715091d37SBarry Smith 
310815091d37SBarry Smith   PetscFunctionBegin;
31091ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
31101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
311115091d37SBarry Smith 
311215091d37SBarry Smith   /* forward solve the lower triangular */
311315091d37SBarry Smith   idx    = 0;
311415091d37SBarry Smith   x[0]   = b[0];
311515091d37SBarry Smith   for (i=1; i<n; i++) {
311615091d37SBarry Smith     v     =  aa      + ai[i];
311715091d37SBarry Smith     vi    =  aj      + ai[i];
311815091d37SBarry Smith     nz    =  diag[i] - ai[i];
311915091d37SBarry Smith     idx   +=  1;
3120f1af5d2fSBarry Smith     s1  =  b[idx];
312115091d37SBarry Smith     while (nz--) {
312215091d37SBarry Smith       jdx   = *vi++;
312315091d37SBarry Smith       x1    = x[jdx];
3124f1af5d2fSBarry Smith       s1 -= v[0]*x1;
312515091d37SBarry Smith       v    += 1;
312615091d37SBarry Smith     }
3127f1af5d2fSBarry Smith     x[idx]   = s1;
312815091d37SBarry Smith   }
312915091d37SBarry Smith   /* backward solve the upper triangular */
313015091d37SBarry Smith   for (i=n-1; i>=0; i--){
313115091d37SBarry Smith     v    = aa + diag[i] + 1;
313215091d37SBarry Smith     vi   = aj + diag[i] + 1;
313315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
313415091d37SBarry Smith     idt  = i;
3135f1af5d2fSBarry Smith     s1 = x[idt];
313615091d37SBarry Smith     while (nz--) {
313715091d37SBarry Smith       idx   = *vi++;
313815091d37SBarry Smith       x1    = x[idx];
3139f1af5d2fSBarry Smith       s1 -= v[0]*x1;
314015091d37SBarry Smith       v    += 1;
314115091d37SBarry Smith     }
314215091d37SBarry Smith     v        = aa +  diag[i];
3143f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
314415091d37SBarry Smith   }
31451ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
31461ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3147dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
314815091d37SBarry Smith   PetscFunctionReturn(0);
314915091d37SBarry Smith }
31504e2b4712SSatish Balay 
31514e2b4712SSatish Balay /* ----------------------------------------------------------------*/
31526bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
31536bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
31546bce7ff8SHong Zhang 
315584a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
31566bce7ff8SHong Zhang #undef __FUNCT__
31576bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
31586bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
31596bce7ff8SHong Zhang {
31606bce7ff8SHong Zhang   Mat            C=B;
31616bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
31626bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
31636bce7ff8SHong Zhang   PetscErrorCode ierr;
31646bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
31656bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
31666bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3167914a18a2SHong Zhang   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3168914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3169914a18a2SHong Zhang   MatScalar      *v_work;
31706bce7ff8SHong Zhang 
31716bce7ff8SHong Zhang   PetscFunctionBegin;
31726bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
31736bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3174914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3175914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
31766bce7ff8SHong Zhang   ics  = ic;
31776bce7ff8SHong Zhang 
3178914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
3179914a18a2SHong Zhang   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3180914a18a2SHong Zhang   multiplier = v_work + bs;
3181914a18a2SHong Zhang   v_pivots   = (PetscInt*)(multiplier + bs2);
3182914a18a2SHong Zhang 
31836bce7ff8SHong Zhang   for (i=0; i<n; i++){
31846bce7ff8SHong Zhang     /* zero rtmp */
31856bce7ff8SHong Zhang     /* L part */
31866bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
31876bce7ff8SHong Zhang     bjtmp = bj + bi[i];
3188914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3189914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3190914a18a2SHong Zhang     }
31916bce7ff8SHong Zhang 
31926bce7ff8SHong Zhang     /* U part */
31936bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
31946bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
3195914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3196914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3197914a18a2SHong Zhang     }
31986bce7ff8SHong Zhang 
31996bce7ff8SHong Zhang     /* load in initial (unfactored row) */
32006bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
32016bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
3202914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
32036bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3204914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
32056bce7ff8SHong Zhang     }
32066bce7ff8SHong Zhang 
32076bce7ff8SHong Zhang     /* elimination */
32086bce7ff8SHong Zhang     bjtmp = bj + bi[i];
32096bce7ff8SHong Zhang     row   = *bjtmp++;
32106bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
32116bce7ff8SHong Zhang     k   = 0;
32126bce7ff8SHong Zhang     while  (k < nzL) {
3213914a18a2SHong Zhang       pc = rtmp + bs2*row;
3214914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3215914a18a2SHong Zhang       if (flg) {
3216914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
3217914a18a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
32186bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3219914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
32206bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3221914a18a2SHong Zhang         for (j=0; j<nz; j++) {
3222914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3223914a18a2SHong Zhang         }
32246bce7ff8SHong Zhang         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
32256bce7ff8SHong Zhang       }
32266bce7ff8SHong Zhang       row = *bjtmp++; k++;
32276bce7ff8SHong Zhang     }
32286bce7ff8SHong Zhang 
32296bce7ff8SHong Zhang     /* finished row so stick it into b->a */
32306bce7ff8SHong Zhang     /* L part */
3231914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
32326bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
32336bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
32346bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3235914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
32366bce7ff8SHong Zhang     }
32376bce7ff8SHong Zhang 
32386bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
3239914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
32406bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
3241914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3242914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3243914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
32446bce7ff8SHong Zhang 
32456bce7ff8SHong Zhang     /* U part */
3246914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
32476bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
32486bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3249914a18a2SHong Zhang     for (j=0; j<nz; j++){
3250914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3251914a18a2SHong Zhang     }
32526bce7ff8SHong Zhang   }
32536bce7ff8SHong Zhang 
32546bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
32556bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
32566bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
32576bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
325884a281e5SHong Zhang   if (bs == 5){
325984a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
326084a281e5SHong Zhang   } else {
326184a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
326284a281e5SHong Zhang   }
32636bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
3264914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
32656bce7ff8SHong Zhang   PetscFunctionReturn(0);
32666bce7ff8SHong Zhang }
32676bce7ff8SHong Zhang 
32686bce7ff8SHong Zhang /*
32696bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
32706bce7ff8SHong Zhang    Factored arrays bj and ba are stored as
32716bce7ff8SHong Zhang      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
32726bce7ff8SHong Zhang 
32736bce7ff8SHong Zhang    bi=fact->i is an array of size 2n+2, in which
32746bce7ff8SHong Zhang    bi+
32756bce7ff8SHong Zhang      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
32766bce7ff8SHong Zhang      bi[n]      ->  end of L(n-1,:)+1
32776bce7ff8SHong Zhang      bi[n+1]    ->  1st entry of U(n-1,:)
32786bce7ff8SHong Zhang      bi[2n-i]   ->  1st entry of U(i,:)
32796bce7ff8SHong Zhang      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
32806bce7ff8SHong Zhang      bi[2n]     ->  end of U(0,:)+1
32816bce7ff8SHong Zhang 
32826bce7ff8SHong Zhang    U(i,:) contains diag[i] as its last entry, i.e.,
32836bce7ff8SHong Zhang     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
32846bce7ff8SHong Zhang */
32856bce7ff8SHong Zhang #undef __FUNCT__
32866bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
32876bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
32886bce7ff8SHong Zhang {
32896bce7ff8SHong Zhang 
32906bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
32916bce7ff8SHong Zhang   PetscErrorCode     ierr;
3292914a18a2SHong Zhang   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
32936bce7ff8SHong Zhang   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
32946bce7ff8SHong Zhang 
32956bce7ff8SHong Zhang   PetscFunctionBegin;
32966bce7ff8SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
32976bce7ff8SHong Zhang   b     = (Mat_SeqBAIJ*)(fact)->data;
3298914a18a2SHong Zhang   bdiag = b->diag;
32996bce7ff8SHong Zhang 
33006bce7ff8SHong Zhang   /* replace matrix arrays with single allocations, then reset values */
33016bce7ff8SHong Zhang   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
33026bce7ff8SHong Zhang 
33036bce7ff8SHong Zhang   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
33046bce7ff8SHong Zhang   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
33056bce7ff8SHong Zhang   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
33066bce7ff8SHong Zhang   b->singlemalloc = PETSC_FALSE;
33076bce7ff8SHong Zhang   if (mbs > 0) {
33086bce7ff8SHong Zhang     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
33096bce7ff8SHong Zhang   }
33106bce7ff8SHong Zhang 
33116bce7ff8SHong Zhang   /* set bi and bj with new data structure */
33126bce7ff8SHong Zhang   bi = b->i;
33136bce7ff8SHong Zhang   bj = b->j;
33146bce7ff8SHong Zhang 
33156bce7ff8SHong Zhang   /* L part */
33166bce7ff8SHong Zhang   bi[0] = 0;
33176bce7ff8SHong Zhang   for (i=0; i<mbs; i++){
33186bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
3319914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
33206bce7ff8SHong Zhang     aj = a->j + ai[i];
33216bce7ff8SHong Zhang     for (j=0; j<nz; j++){
33226bce7ff8SHong Zhang       *bj = aj[j]; bj++;
33236bce7ff8SHong Zhang     }
33246bce7ff8SHong Zhang   }
33256bce7ff8SHong Zhang 
33266bce7ff8SHong Zhang   /* U part */
33276bce7ff8SHong Zhang   bi[mbs+1] = bi[mbs];
33286bce7ff8SHong Zhang   for (i=mbs-1; i>=0; i--){
33296bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
33306bce7ff8SHong Zhang     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3331914a18a2SHong Zhang     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
33326bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
33336bce7ff8SHong Zhang     for (j=0; j<nz; j++){
33346bce7ff8SHong Zhang       *bj = aj[j]; bj++;
33356bce7ff8SHong Zhang     }
33366bce7ff8SHong Zhang     /* diag[i] */
33376bce7ff8SHong Zhang     *bj = i; bj++;
33386bce7ff8SHong Zhang     bdiag[i] = bi[2*mbs-i+1]-1;
33396bce7ff8SHong Zhang   }
33406bce7ff8SHong Zhang   PetscFunctionReturn(0);
33416bce7ff8SHong Zhang }
33426bce7ff8SHong Zhang 
33434e2b4712SSatish Balay /*
33444e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
33454e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
33464e2b4712SSatish Balay    Not a good example of code reuse.
33474e2b4712SSatish Balay */
3348435faa5fSBarry Smith 
33494a2ae208SSatish Balay #undef __FUNCT__
33504a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
33510481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
33524e2b4712SSatish Balay {
33534e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
33544e2b4712SSatish Balay   IS             isicol;
33556849ba73SBarry Smith   PetscErrorCode ierr;
33565d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
33575d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3358a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3359d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
336041df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
3361329f5518SBarry Smith   PetscReal      f;
33624e2b4712SSatish Balay 
33634e2b4712SSatish Balay   PetscFunctionBegin;
33646bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
33656bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
33666bce7ff8SHong Zhang 
3367435faa5fSBarry Smith   f             = info->fill;
3368690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3369690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
33704c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3371667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3372667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
33737d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
3374309c388cSBarry Smith 
337541df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
33766bce7ff8SHong Zhang 
33776bce7ff8SHong Zhang     PetscTruth newdatastruct=PETSC_FALSE;
33786bce7ff8SHong Zhang     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
33796bce7ff8SHong Zhang     if (newdatastruct){
33806bce7ff8SHong Zhang       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
33816bce7ff8SHong Zhang       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
33826bce7ff8SHong Zhang     } else {
3383719d5645SBarry Smith       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
33846bce7ff8SHong Zhang       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
33856bce7ff8SHong Zhang     }
33866bce7ff8SHong Zhang 
3387719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3388719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
3389bb3d539aSBarry Smith     b->row       = isrow;
3390bb3d539aSBarry Smith     b->col       = iscol;
3391bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3392bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3393bb3d539aSBarry Smith     b->icol      = isicol;
3394bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3395719d5645SBarry Smith     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
33966bce7ff8SHong Zhang     PetscFunctionReturn(0);
33976bce7ff8SHong Zhang   }
33986bce7ff8SHong Zhang 
33996bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
34004e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34014e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
34024e2b4712SSatish Balay 
34034e2b4712SSatish Balay     /* get new row pointers */
3404690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
34054e2b4712SSatish Balay     ainew[0] = 0;
34064e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3407690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3408690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
34094e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3410690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
34114e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3412690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
34134e2b4712SSatish Balay     /* im is level for each filled value */
3414690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
34154e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3416690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
34174e2b4712SSatish Balay     dloc[0]  = 0;
34184e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3419435faa5fSBarry Smith 
3420435faa5fSBarry Smith       /* copy prow into linked list */
34214e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
34223b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
34234e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
34244e2b4712SSatish Balay       fill[n]    = n;
3425435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
34264e2b4712SSatish Balay       while (nz--) {
34274e2b4712SSatish Balay 	fm  = n;
34284e2b4712SSatish Balay 	idx = ic[*xi++];
34294e2b4712SSatish Balay 	do {
34304e2b4712SSatish Balay 	  m  = fm;
34314e2b4712SSatish Balay 	  fm = fill[m];
34324e2b4712SSatish Balay 	} while (fm < idx);
34334e2b4712SSatish Balay 	fill[m]   = idx;
34344e2b4712SSatish Balay 	fill[idx] = fm;
34354e2b4712SSatish Balay 	im[idx]   = 0;
34364e2b4712SSatish Balay       }
3437435faa5fSBarry Smith 
3438435faa5fSBarry Smith       /* make sure diagonal entry is included */
3439435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3440435faa5fSBarry Smith 	fm = n;
3441435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3442435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3443435faa5fSBarry Smith 	fill[fm]   = prow;
3444435faa5fSBarry Smith 	im[prow]   = 0;
3445435faa5fSBarry Smith 	nzf++;
3446335d9088SBarry Smith 	dcount++;
3447435faa5fSBarry Smith       }
3448435faa5fSBarry Smith 
34494e2b4712SSatish Balay       nzi = 0;
34504e2b4712SSatish Balay       row = fill[n];
34514e2b4712SSatish Balay       while (row < prow) {
34524e2b4712SSatish Balay 	incrlev = im[row] + 1;
34534e2b4712SSatish Balay 	nz      = dloc[row];
3454435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
34554e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
34564e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
34574e2b4712SSatish Balay 	fm      = row;
34584e2b4712SSatish Balay 	while (nnz-- > 0) {
34594e2b4712SSatish Balay 	  idx = *xi++;
34604e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
34614e2b4712SSatish Balay 	    flev++;
34624e2b4712SSatish Balay 	    continue;
34634e2b4712SSatish Balay 	  }
34644e2b4712SSatish Balay 	  do {
34654e2b4712SSatish Balay 	    m  = fm;
34664e2b4712SSatish Balay 	    fm = fill[m];
34674e2b4712SSatish Balay 	  } while (fm < idx);
34684e2b4712SSatish Balay 	  if (fm != idx) {
34694e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
34704e2b4712SSatish Balay 	    fill[m]   = idx;
34714e2b4712SSatish Balay 	    fill[idx] = fm;
34724e2b4712SSatish Balay 	    fm        = idx;
34734e2b4712SSatish Balay 	    nzf++;
3474ecf371e4SBarry Smith 	  } else {
34754e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
34764e2b4712SSatish Balay 	  }
34774e2b4712SSatish Balay 	  flev++;
34784e2b4712SSatish Balay 	}
34794e2b4712SSatish Balay 	row = fill[row];
34804e2b4712SSatish Balay 	nzi++;
34814e2b4712SSatish Balay       }
34824e2b4712SSatish Balay       /* copy new filled row into permanent storage */
34834e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
34844e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3485ecf371e4SBarry Smith 
3486ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3487ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3488ecf371e4SBarry Smith 	/* just double the memory each time */
3489690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3490ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
34914e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
34924e2b4712SSatish Balay 	jmax += maxadd;
3493ecf371e4SBarry Smith 
3494ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
34955d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
34965d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3497606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
34985d0c19d7SBarry Smith 	ajnew = xitmp;
34995d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
35005d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3501606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
35025d0c19d7SBarry Smith 	ajfill = xitmp;
3503eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
35044e2b4712SSatish Balay       }
35055d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
35064e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
35074e2b4712SSatish Balay       dloc[prow]  = nzi;
35084e2b4712SSatish Balay       fm          = fill[n];
35094e2b4712SSatish Balay       while (nzf--) {
35105d0c19d7SBarry Smith 	*xitmp++ = fm;
35114e2b4712SSatish Balay 	*flev++ = im[fm];
35124e2b4712SSatish Balay 	fm      = fill[fm];
35134e2b4712SSatish Balay       }
3514435faa5fSBarry Smith       /* make sure row has diagonal entry */
3515435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
351677431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
35172401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3518435faa5fSBarry Smith       }
35194e2b4712SSatish Balay     }
3520606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
35214e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
35224e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3523606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3524606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
35254e2b4712SSatish Balay 
35266cf91177SBarry Smith #if defined(PETSC_USE_INFO)
35274e2b4712SSatish Balay     {
3528329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3529ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3530ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3531ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3532ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3533335d9088SBarry Smith       if (diagonal_fill) {
3534ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3535335d9088SBarry Smith       }
35364e2b4712SSatish Balay     }
353763ba0a88SBarry Smith #endif
35384e2b4712SSatish Balay 
35394e2b4712SSatish Balay     /* put together the new matrix */
3540719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3541719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3542719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3543e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3544e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
35457c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3546a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
35474e2b4712SSatish Balay     b->j          = ajnew;
35484e2b4712SSatish Balay     b->i          = ainew;
35494e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
35504e2b4712SSatish Balay     b->diag       = dloc;
35514e2b4712SSatish Balay     b->ilen       = 0;
35524e2b4712SSatish Balay     b->imax       = 0;
35534e2b4712SSatish Balay     b->row        = isrow;
35544e2b4712SSatish Balay     b->col        = iscol;
3555bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3556c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3557c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3558e51c0b9cSSatish Balay     b->icol       = isicol;
355987828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
35604e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
35614e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3562719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
35634e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
35644e2b4712SSatish Balay 
3565719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3566719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3567719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
35686bce7ff8SHong Zhang 
356941df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
35708661488fSKris Buschelman   PetscFunctionReturn(0);
35718661488fSKris Buschelman }
35728661488fSKris Buschelman 
3573732ee342SKris Buschelman #undef __FUNCT__
35747e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3575dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
35767e7071cdSKris Buschelman {
357712272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
357812272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
35795a9542e3SKris Buschelman   PetscFunctionBegin;
35807cf1b8d3SKris Buschelman   /* Undo Column scaling */
35817cf1b8d3SKris Buschelman /*    while (nz--) { */
35827cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
35837cf1b8d3SKris Buschelman /*    } */
3584c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3585c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
35867cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
35877cf1b8d3SKris Buschelman }
35887cf1b8d3SKris Buschelman 
35897cf1b8d3SKris Buschelman #undef __FUNCT__
35907cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3591dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
35927cf1b8d3SKris Buschelman {
35937cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3594b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
35952aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
35965a9542e3SKris Buschelman   PetscFunctionBegin;
35970b9da03eSKris Buschelman   /* Is this really necessary? */
359820235379SKris Buschelman   while (nz--) {
35990b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
36007e7071cdSKris Buschelman   }
3601c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
36027e7071cdSKris Buschelman   PetscFunctionReturn(0);
36037e7071cdSKris Buschelman }
36047e7071cdSKris Buschelman 
3605732ee342SKris Buschelman 
3606