xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 270193593457fdfb6a4b65101c096434000e663c)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
104e2b4712SSatish Balay 
114a2ae208SSatish Balay #undef __FUNCT__
124a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
13dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
14f1af5d2fSBarry Smith {
15f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
16dfbe8321SBarry Smith   PetscErrorCode ierr;
17690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
18690b6cddSBarry Smith   PetscInt       *diag = a->diag;
19f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2087828ca2SBarry Smith   PetscScalar    s1,*x,*b;
21f1af5d2fSBarry Smith 
22f1af5d2fSBarry Smith   PetscFunctionBegin;
23ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26f1af5d2fSBarry Smith 
27f1af5d2fSBarry Smith   /* forward solve the U^T */
28f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
29f1af5d2fSBarry Smith 
30f1af5d2fSBarry Smith     v     = aa + diag[i];
31f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
32ef66eb69SBarry Smith     s1    = (*v++)*x[i];
33f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
34f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
35f1af5d2fSBarry Smith     while (nz--) {
36f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
37f1af5d2fSBarry Smith     }
38f1af5d2fSBarry Smith     x[i]   = s1;
39f1af5d2fSBarry Smith   }
40f1af5d2fSBarry Smith   /* backward solve the L^T */
41f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
42f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
43f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
44f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
45f1af5d2fSBarry Smith     s1   = x[i];
46f1af5d2fSBarry Smith     while (nz--) {
47f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
48f1af5d2fSBarry Smith     }
49f1af5d2fSBarry Smith   }
501ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
511ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
52dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
53f1af5d2fSBarry Smith   PetscFunctionReturn(0);
54f1af5d2fSBarry Smith }
55f1af5d2fSBarry Smith 
56*27019359SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
57*27019359SHong Zhang {
58*27019359SHong Zhang     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
59*27019359SHong Zhang     PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
60*27019359SHong Zhang     PetscErrorCode    ierr;
61*27019359SHong Zhang     PetscInt          jdx;
62*27019359SHong Zhang     const MatScalar   *aa=a->a,*v;
63*27019359SHong Zhang     PetscScalar       *x,s1,s2,x1,x2;
64*27019359SHong Zhang     const PetscScalar *b;
65*27019359SHong Zhang 
66*27019359SHong Zhang     PetscFunctionBegin;
67*27019359SHong Zhang     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
68*27019359SHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
69*27019359SHong Zhang     /* forward solve the lower triangular */
70*27019359SHong Zhang     idx    = 0;
71*27019359SHong Zhang     x[0] = b[idx]; x[1] = b[1+idx];
72*27019359SHong Zhang     for (i=1; i<n; i++) {
73*27019359SHong Zhang         v   = aa + 4*ai[i];
74*27019359SHong Zhang        vi   = aj + ai[i];
75*27019359SHong Zhang        nz   = ai[i+1] - ai[i];
76*27019359SHong Zhang        idx  = 2*i;
77*27019359SHong Zhang        s1   = b[idx];s2 = b[1+idx];
78*27019359SHong Zhang        while (nz--) {
79*27019359SHong Zhang           jdx   = 2*(*vi++);
80*27019359SHong Zhang           x1    = x[jdx];x2 = x[1+jdx];
81*27019359SHong Zhang           s1   -= v[0]*x1 + v[2]*x2;
82*27019359SHong Zhang           s2   -= v[1]*x1 + v[3]*x2;
83*27019359SHong Zhang            v   +=  4;
84*27019359SHong Zhang         }
85*27019359SHong Zhang        x[idx]   = s1;
86*27019359SHong Zhang        x[1+idx] = s2;
87*27019359SHong Zhang     }
88*27019359SHong Zhang 
89*27019359SHong Zhang    /* backward solve the upper triangular */
90*27019359SHong Zhang   for (i=n-1; i>=0; i--){
91*27019359SHong Zhang      v   = aa + 4*ai[2*n-i];
92*27019359SHong Zhang      vi  = aj + ai[2*n-i];
93*27019359SHong Zhang      nz  = ai[2*n-i +1] - ai[2*n-i]-1;
94*27019359SHong Zhang      idt = 2*i;
95*27019359SHong Zhang      s1 = x[idt];  s2 = x[1+idt];
96*27019359SHong Zhang      while (nz--) {
97*27019359SHong Zhang       idx   = 2*(*vi++);
98*27019359SHong Zhang        x1    = x[idx];   x2 = x[1+idx];
99*27019359SHong Zhang        s1 -= v[0]*x1 + v[2]*x2;
100*27019359SHong Zhang        s2 -= v[1]*x1 + v[3]*x2;
101*27019359SHong Zhang          v    += 4;
102*27019359SHong Zhang     }
103*27019359SHong Zhang     /* x = inv_diagonal*x */
104*27019359SHong Zhang    x[idt]   = v[0]*s1 + v[2]*s2;
105*27019359SHong Zhang    x[1+idt] = v[1]*s1 + v[3]*s2;
106*27019359SHong Zhang   }
107*27019359SHong Zhang 
108*27019359SHong Zhang   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
109*27019359SHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
110*27019359SHong Zhang   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
111*27019359SHong Zhang   PetscFunctionReturn(0);
112*27019359SHong Zhang }
113*27019359SHong Zhang 
1144a2ae208SSatish Balay #undef __FUNCT__
1154a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
116dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
117f1af5d2fSBarry Smith {
118f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119dfbe8321SBarry Smith   PetscErrorCode ierr;
120690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
121690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
122f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
12387828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
12487828ca2SBarry Smith   PetscScalar    *x,*b;
125f1af5d2fSBarry Smith 
126f1af5d2fSBarry Smith   PetscFunctionBegin;
127ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1281ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1291ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
130f1af5d2fSBarry Smith 
131f1af5d2fSBarry Smith   /* forward solve the U^T */
132f1af5d2fSBarry Smith   idx = 0;
133f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
134f1af5d2fSBarry Smith 
135f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
136f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
137ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
138f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
139f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
140f1af5d2fSBarry Smith     v += 4;
141f1af5d2fSBarry Smith 
142f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
143f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
144f1af5d2fSBarry Smith     while (nz--) {
145f1af5d2fSBarry Smith       oidx = 2*(*vi++);
146f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
147f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
148f1af5d2fSBarry Smith       v  += 4;
149f1af5d2fSBarry Smith     }
150f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
151f1af5d2fSBarry Smith     idx += 2;
152f1af5d2fSBarry Smith   }
153f1af5d2fSBarry Smith   /* backward solve the L^T */
154f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
155f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
156f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
157f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
158f1af5d2fSBarry Smith     idt  = 2*i;
159f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
160f1af5d2fSBarry Smith     while (nz--) {
161f1af5d2fSBarry Smith       idx   = 2*(*vi--);
162f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
163f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
164f1af5d2fSBarry Smith       v -= 4;
165f1af5d2fSBarry Smith     }
166f1af5d2fSBarry Smith   }
1671ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
169dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
170f1af5d2fSBarry Smith   PetscFunctionReturn(0);
171f1af5d2fSBarry Smith }
172f1af5d2fSBarry Smith 
1734a2ae208SSatish Balay #undef __FUNCT__
1744a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
175dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
176f1af5d2fSBarry Smith {
177f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
178dfbe8321SBarry Smith   PetscErrorCode ierr;
179690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
180690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
181f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18287828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
18387828ca2SBarry Smith   PetscScalar    *x,*b;
184f1af5d2fSBarry Smith 
185f1af5d2fSBarry Smith   PetscFunctionBegin;
186ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1871ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1881ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
189f1af5d2fSBarry Smith 
190f1af5d2fSBarry Smith   /* forward solve the U^T */
191f1af5d2fSBarry Smith   idx = 0;
192f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
193f1af5d2fSBarry Smith 
194f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
195f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
196ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
197f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
198f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
199f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
200f1af5d2fSBarry Smith     v += 9;
201f1af5d2fSBarry Smith 
202f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
203f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
204f1af5d2fSBarry Smith     while (nz--) {
205f1af5d2fSBarry Smith       oidx = 3*(*vi++);
206f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
207f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
208f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
209f1af5d2fSBarry Smith       v  += 9;
210f1af5d2fSBarry Smith     }
211f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
212f1af5d2fSBarry Smith     idx += 3;
213f1af5d2fSBarry Smith   }
214f1af5d2fSBarry Smith   /* backward solve the L^T */
215f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
216f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
217f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
218f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
219f1af5d2fSBarry Smith     idt  = 3*i;
220f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
221f1af5d2fSBarry Smith     while (nz--) {
222f1af5d2fSBarry Smith       idx   = 3*(*vi--);
223f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
224f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
225f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
226f1af5d2fSBarry Smith       v -= 9;
227f1af5d2fSBarry Smith     }
228f1af5d2fSBarry Smith   }
2291ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2301ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
231dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
232f1af5d2fSBarry Smith   PetscFunctionReturn(0);
233f1af5d2fSBarry Smith }
234f1af5d2fSBarry Smith 
2354a2ae208SSatish Balay #undef __FUNCT__
2364a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
237dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
238f1af5d2fSBarry Smith {
239f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
240dfbe8321SBarry Smith   PetscErrorCode ierr;
241690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
242690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
243f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
24487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
24587828ca2SBarry Smith   PetscScalar    *x,*b;
246f1af5d2fSBarry Smith 
247f1af5d2fSBarry Smith   PetscFunctionBegin;
248ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2491ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2501ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
251f1af5d2fSBarry Smith 
252f1af5d2fSBarry Smith   /* forward solve the U^T */
253f1af5d2fSBarry Smith   idx = 0;
254f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
255f1af5d2fSBarry Smith 
256f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
257f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
258ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
259f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
260f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
261f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
262f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
263f1af5d2fSBarry Smith     v += 16;
264f1af5d2fSBarry Smith 
265f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
266f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
267f1af5d2fSBarry Smith     while (nz--) {
268f1af5d2fSBarry Smith       oidx = 4*(*vi++);
269f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
270f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
271f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
272f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
273f1af5d2fSBarry Smith       v  += 16;
274f1af5d2fSBarry Smith     }
275f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
276f1af5d2fSBarry Smith     idx += 4;
277f1af5d2fSBarry Smith   }
278f1af5d2fSBarry Smith   /* backward solve the L^T */
279f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
280f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
281f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
282f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
283f1af5d2fSBarry Smith     idt  = 4*i;
284f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
285f1af5d2fSBarry Smith     while (nz--) {
286f1af5d2fSBarry Smith       idx   = 4*(*vi--);
287f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
288f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
289f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
290f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
291f1af5d2fSBarry Smith       v -= 16;
292f1af5d2fSBarry Smith     }
293f1af5d2fSBarry Smith   }
2941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
297f1af5d2fSBarry Smith   PetscFunctionReturn(0);
298f1af5d2fSBarry Smith }
299f1af5d2fSBarry Smith 
3004a2ae208SSatish Balay #undef __FUNCT__
3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
302dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
303f1af5d2fSBarry Smith {
304f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305dfbe8321SBarry Smith   PetscErrorCode ierr;
306690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
308f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
30987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
31087828ca2SBarry Smith   PetscScalar    *x,*b;
311f1af5d2fSBarry Smith 
312f1af5d2fSBarry Smith   PetscFunctionBegin;
313ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3141ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316f1af5d2fSBarry Smith 
317f1af5d2fSBarry Smith   /* forward solve the U^T */
318f1af5d2fSBarry Smith   idx = 0;
319f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
322f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
323ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
324f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
325f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
326f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
327f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
328f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
329f1af5d2fSBarry Smith     v += 25;
330f1af5d2fSBarry Smith 
331f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
332f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
333f1af5d2fSBarry Smith     while (nz--) {
334f1af5d2fSBarry Smith       oidx = 5*(*vi++);
335f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
336f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
337f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
338f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
339f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
340f1af5d2fSBarry Smith       v  += 25;
341f1af5d2fSBarry Smith     }
342f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
343f1af5d2fSBarry Smith     idx += 5;
344f1af5d2fSBarry Smith   }
345f1af5d2fSBarry Smith   /* backward solve the L^T */
346f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
347f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
348f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
349f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
350f1af5d2fSBarry Smith     idt  = 5*i;
351f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
352f1af5d2fSBarry Smith     while (nz--) {
353f1af5d2fSBarry Smith       idx   = 5*(*vi--);
354f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
355f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
356f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
357f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
358f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
359f1af5d2fSBarry Smith       v -= 25;
360f1af5d2fSBarry Smith     }
361f1af5d2fSBarry Smith   }
3621ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
364dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
365f1af5d2fSBarry Smith   PetscFunctionReturn(0);
366f1af5d2fSBarry Smith }
367f1af5d2fSBarry Smith 
3684a2ae208SSatish Balay #undef __FUNCT__
3694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
370dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
371f1af5d2fSBarry Smith {
372f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
373dfbe8321SBarry Smith   PetscErrorCode ierr;
374690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
375690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
376f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
37787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
37887828ca2SBarry Smith   PetscScalar    *x,*b;
379f1af5d2fSBarry Smith 
380f1af5d2fSBarry Smith   PetscFunctionBegin;
381ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
384f1af5d2fSBarry Smith 
385f1af5d2fSBarry Smith   /* forward solve the U^T */
386f1af5d2fSBarry Smith   idx = 0;
387f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
388f1af5d2fSBarry Smith 
389f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
390f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
391ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
392ef66eb69SBarry Smith     x6    = x[5+idx];
393f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
394f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
395f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
396f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
397f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
398f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
399f1af5d2fSBarry Smith     v += 36;
400f1af5d2fSBarry Smith 
401f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
402f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
403f1af5d2fSBarry Smith     while (nz--) {
404f1af5d2fSBarry Smith       oidx = 6*(*vi++);
405f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
406f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
407f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
408f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
409f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
410f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
411f1af5d2fSBarry Smith       v  += 36;
412f1af5d2fSBarry Smith     }
413f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
414f1af5d2fSBarry Smith     x[5+idx] = s6;
415f1af5d2fSBarry Smith     idx += 6;
416f1af5d2fSBarry Smith   }
417f1af5d2fSBarry Smith   /* backward solve the L^T */
418f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
419f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
420f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
421f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
422f1af5d2fSBarry Smith     idt  = 6*i;
423f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
424f1af5d2fSBarry Smith     s6 = x[5+idt];
425f1af5d2fSBarry Smith     while (nz--) {
426f1af5d2fSBarry Smith       idx   = 6*(*vi--);
427f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
428f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
429f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
430f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
431f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
432f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
433f1af5d2fSBarry Smith       v -= 36;
434f1af5d2fSBarry Smith     }
435f1af5d2fSBarry Smith   }
4361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
438dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
439f1af5d2fSBarry Smith   PetscFunctionReturn(0);
440f1af5d2fSBarry Smith }
441f1af5d2fSBarry Smith 
4424a2ae208SSatish Balay #undef __FUNCT__
4434a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
444dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
445f1af5d2fSBarry Smith {
446f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
447dfbe8321SBarry Smith   PetscErrorCode ierr;
448690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
449690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
450f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
45187828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
45287828ca2SBarry Smith   PetscScalar    *x,*b;
453f1af5d2fSBarry Smith 
454f1af5d2fSBarry Smith   PetscFunctionBegin;
455ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
458f1af5d2fSBarry Smith 
459f1af5d2fSBarry Smith   /* forward solve the U^T */
460f1af5d2fSBarry Smith   idx = 0;
461f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
462f1af5d2fSBarry Smith 
463f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
464f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
465ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
466ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
467f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
468f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
469f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
470f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
471f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
472f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
473f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
474f1af5d2fSBarry Smith     v += 49;
475f1af5d2fSBarry Smith 
476f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
477f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
478f1af5d2fSBarry Smith     while (nz--) {
479f1af5d2fSBarry Smith       oidx = 7*(*vi++);
480f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
481f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
482f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
483f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
484f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
485f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
486f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
487f1af5d2fSBarry Smith       v  += 49;
488f1af5d2fSBarry Smith     }
489f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
490f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
491f1af5d2fSBarry Smith     idx += 7;
492f1af5d2fSBarry Smith   }
493f1af5d2fSBarry Smith   /* backward solve the L^T */
494f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
495f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
496f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
497f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
498f1af5d2fSBarry Smith     idt  = 7*i;
499f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
500f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
501f1af5d2fSBarry Smith     while (nz--) {
502f1af5d2fSBarry Smith       idx   = 7*(*vi--);
503f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
504f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
505f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
506f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
507f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
508f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
509f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
510f1af5d2fSBarry Smith       v -= 49;
511f1af5d2fSBarry Smith     }
512f1af5d2fSBarry Smith   }
5131ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
515dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
516f1af5d2fSBarry Smith   PetscFunctionReturn(0);
517f1af5d2fSBarry Smith }
518f1af5d2fSBarry Smith 
519f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
5204a2ae208SSatish Balay #undef __FUNCT__
5214a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
522dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
523f1af5d2fSBarry Smith {
524f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
525f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5266849ba73SBarry Smith   PetscErrorCode ierr;
5275d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5285d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
529690b6cddSBarry Smith   PetscInt       *diag = a->diag;
530f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
53187828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
532f1af5d2fSBarry Smith 
533f1af5d2fSBarry Smith   PetscFunctionBegin;
5341ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
536f1af5d2fSBarry Smith   t  = a->solve_work;
537f1af5d2fSBarry Smith 
538f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
539f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
540f1af5d2fSBarry Smith 
541f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
542f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
543f1af5d2fSBarry Smith     t[i] = b[c[i]];
544f1af5d2fSBarry Smith   }
545f1af5d2fSBarry Smith 
546f1af5d2fSBarry Smith   /* forward solve the U^T */
547f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
548f1af5d2fSBarry Smith 
549f1af5d2fSBarry Smith     v     = aa + diag[i];
550f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
551f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
552f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
553f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
554f1af5d2fSBarry Smith     while (nz--) {
555f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
556f1af5d2fSBarry Smith     }
557f1af5d2fSBarry Smith     t[i]   = s1;
558f1af5d2fSBarry Smith   }
559f1af5d2fSBarry Smith   /* backward solve the L^T */
560f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
561f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
562f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
563f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
564f1af5d2fSBarry Smith     s1   = t[i];
565f1af5d2fSBarry Smith     while (nz--) {
566f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
567f1af5d2fSBarry Smith     }
568f1af5d2fSBarry Smith   }
569f1af5d2fSBarry Smith 
570f1af5d2fSBarry Smith   /* copy t into x according to permutation */
571f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
572f1af5d2fSBarry Smith     x[r[i]]   = t[i];
573f1af5d2fSBarry Smith   }
574f1af5d2fSBarry Smith 
575f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
576f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5771ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
579dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
580f1af5d2fSBarry Smith   PetscFunctionReturn(0);
581f1af5d2fSBarry Smith }
582f1af5d2fSBarry Smith 
5834a2ae208SSatish Balay #undef __FUNCT__
5844a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
585dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
586f1af5d2fSBarry Smith {
587f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
588f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
5896849ba73SBarry Smith   PetscErrorCode ierr;
5905d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
5915d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
592690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
593f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
59487828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
59587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
596f1af5d2fSBarry Smith 
597f1af5d2fSBarry Smith   PetscFunctionBegin;
5981ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5991ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
600f1af5d2fSBarry Smith   t  = a->solve_work;
601f1af5d2fSBarry Smith 
602f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
603f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
604f1af5d2fSBarry Smith 
605f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
606f1af5d2fSBarry Smith   ii = 0;
607f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
608f1af5d2fSBarry Smith     ic      = 2*c[i];
609f1af5d2fSBarry Smith     t[ii]   = b[ic];
610f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
611f1af5d2fSBarry Smith     ii += 2;
612f1af5d2fSBarry Smith   }
613f1af5d2fSBarry Smith 
614f1af5d2fSBarry Smith   /* forward solve the U^T */
615f1af5d2fSBarry Smith   idx = 0;
616f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
617f1af5d2fSBarry Smith 
618f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
619f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
620f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
621f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
622f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
623f1af5d2fSBarry Smith     v += 4;
624f1af5d2fSBarry Smith 
625f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
626f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
627f1af5d2fSBarry Smith     while (nz--) {
628f1af5d2fSBarry Smith       oidx = 2*(*vi++);
629f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
630f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
631f1af5d2fSBarry Smith       v  += 4;
632f1af5d2fSBarry Smith     }
633f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
634f1af5d2fSBarry Smith     idx += 2;
635f1af5d2fSBarry Smith   }
636f1af5d2fSBarry Smith   /* backward solve the L^T */
637f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
638f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
639f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
640f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
641f1af5d2fSBarry Smith     idt  = 2*i;
642f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
643f1af5d2fSBarry Smith     while (nz--) {
644f1af5d2fSBarry Smith       idx   = 2*(*vi--);
645f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
646f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
647f1af5d2fSBarry Smith       v -= 4;
648f1af5d2fSBarry Smith     }
649f1af5d2fSBarry Smith   }
650f1af5d2fSBarry Smith 
651f1af5d2fSBarry Smith   /* copy t into x according to permutation */
652f1af5d2fSBarry Smith   ii = 0;
653f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
654f1af5d2fSBarry Smith     ir      = 2*r[i];
655f1af5d2fSBarry Smith     x[ir]   = t[ii];
656f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
657f1af5d2fSBarry Smith     ii += 2;
658f1af5d2fSBarry Smith   }
659f1af5d2fSBarry Smith 
660f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
661f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
6621ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6631ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
664dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
665f1af5d2fSBarry Smith   PetscFunctionReturn(0);
666f1af5d2fSBarry Smith }
667f1af5d2fSBarry Smith 
6684a2ae208SSatish Balay #undef __FUNCT__
6694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
670dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
671f1af5d2fSBarry Smith {
672f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
673f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
6746849ba73SBarry Smith   PetscErrorCode ierr;
6755d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
6765d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
677690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
678f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
67987828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
68087828ca2SBarry Smith   PetscScalar    *x,*b,*t;
681f1af5d2fSBarry Smith 
682f1af5d2fSBarry Smith   PetscFunctionBegin;
6831ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6841ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
685f1af5d2fSBarry Smith   t  = a->solve_work;
686f1af5d2fSBarry Smith 
687f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
688f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
689f1af5d2fSBarry Smith 
690f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
691f1af5d2fSBarry Smith   ii = 0;
692f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
693f1af5d2fSBarry Smith     ic      = 3*c[i];
694f1af5d2fSBarry Smith     t[ii]   = b[ic];
695f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
696f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
697f1af5d2fSBarry Smith     ii += 3;
698f1af5d2fSBarry Smith   }
699f1af5d2fSBarry Smith 
700f1af5d2fSBarry Smith   /* forward solve the U^T */
701f1af5d2fSBarry Smith   idx = 0;
702f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
703f1af5d2fSBarry Smith 
704f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
705f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
706f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
707f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
708f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
709f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
710f1af5d2fSBarry Smith     v += 9;
711f1af5d2fSBarry Smith 
712f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
713f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
714f1af5d2fSBarry Smith     while (nz--) {
715f1af5d2fSBarry Smith       oidx = 3*(*vi++);
716f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
717f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
718f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
719f1af5d2fSBarry Smith       v  += 9;
720f1af5d2fSBarry Smith     }
721f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
722f1af5d2fSBarry Smith     idx += 3;
723f1af5d2fSBarry Smith   }
724f1af5d2fSBarry Smith   /* backward solve the L^T */
725f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
726f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
727f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
728f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
729f1af5d2fSBarry Smith     idt  = 3*i;
730f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
731f1af5d2fSBarry Smith     while (nz--) {
732f1af5d2fSBarry Smith       idx   = 3*(*vi--);
733f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
734f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
735f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
736f1af5d2fSBarry Smith       v -= 9;
737f1af5d2fSBarry Smith     }
738f1af5d2fSBarry Smith   }
739f1af5d2fSBarry Smith 
740f1af5d2fSBarry Smith   /* copy t into x according to permutation */
741f1af5d2fSBarry Smith   ii = 0;
742f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
743f1af5d2fSBarry Smith     ir      = 3*r[i];
744f1af5d2fSBarry Smith     x[ir]   = t[ii];
745f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
746f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
747f1af5d2fSBarry Smith     ii += 3;
748f1af5d2fSBarry Smith   }
749f1af5d2fSBarry Smith 
750f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
751f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
7521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
754dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
755f1af5d2fSBarry Smith   PetscFunctionReturn(0);
756f1af5d2fSBarry Smith }
757f1af5d2fSBarry Smith 
7584a2ae208SSatish Balay #undef __FUNCT__
7594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
760dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
761f1af5d2fSBarry Smith {
762f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
763f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
7646849ba73SBarry Smith   PetscErrorCode ierr;
7655d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
7665d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
767690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
768f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
76987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
77087828ca2SBarry Smith   PetscScalar    *x,*b,*t;
771f1af5d2fSBarry Smith 
772f1af5d2fSBarry Smith   PetscFunctionBegin;
7731ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
775f1af5d2fSBarry Smith   t  = a->solve_work;
776f1af5d2fSBarry Smith 
777f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
778f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
779f1af5d2fSBarry Smith 
780f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
781f1af5d2fSBarry Smith   ii = 0;
782f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
783f1af5d2fSBarry Smith     ic      = 4*c[i];
784f1af5d2fSBarry Smith     t[ii]   = b[ic];
785f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
786f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
787f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
788f1af5d2fSBarry Smith     ii += 4;
789f1af5d2fSBarry Smith   }
790f1af5d2fSBarry Smith 
791f1af5d2fSBarry Smith   /* forward solve the U^T */
792f1af5d2fSBarry Smith   idx = 0;
793f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
794f1af5d2fSBarry Smith 
795f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
796f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
797f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
798f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
799f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
800f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
801f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
802f1af5d2fSBarry Smith     v += 16;
803f1af5d2fSBarry Smith 
804f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
805f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
806f1af5d2fSBarry Smith     while (nz--) {
807f1af5d2fSBarry Smith       oidx = 4*(*vi++);
808f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
809f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
810f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
811f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
812f1af5d2fSBarry Smith       v  += 16;
813f1af5d2fSBarry Smith     }
814f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
815f1af5d2fSBarry Smith     idx += 4;
816f1af5d2fSBarry Smith   }
817f1af5d2fSBarry Smith   /* backward solve the L^T */
818f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
819f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
820f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
821f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
822f1af5d2fSBarry Smith     idt  = 4*i;
823f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
824f1af5d2fSBarry Smith     while (nz--) {
825f1af5d2fSBarry Smith       idx   = 4*(*vi--);
826f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
827f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
828f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
829f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
830f1af5d2fSBarry Smith       v -= 16;
831f1af5d2fSBarry Smith     }
832f1af5d2fSBarry Smith   }
833f1af5d2fSBarry Smith 
834f1af5d2fSBarry Smith   /* copy t into x according to permutation */
835f1af5d2fSBarry Smith   ii = 0;
836f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
837f1af5d2fSBarry Smith     ir      = 4*r[i];
838f1af5d2fSBarry Smith     x[ir]   = t[ii];
839f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
840f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
841f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
842f1af5d2fSBarry Smith     ii += 4;
843f1af5d2fSBarry Smith   }
844f1af5d2fSBarry Smith 
845f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
846f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
8471ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
849dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
850f1af5d2fSBarry Smith   PetscFunctionReturn(0);
851f1af5d2fSBarry Smith }
852f1af5d2fSBarry Smith 
8534a2ae208SSatish Balay #undef __FUNCT__
8544a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
855dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
856f1af5d2fSBarry Smith {
857f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
858f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8596849ba73SBarry Smith   PetscErrorCode ierr;
8605d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8615d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
862690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
863f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
86487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
86587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
866f1af5d2fSBarry Smith 
867f1af5d2fSBarry Smith   PetscFunctionBegin;
8681ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
870f1af5d2fSBarry Smith   t  = a->solve_work;
871f1af5d2fSBarry Smith 
872f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
873f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
874f1af5d2fSBarry Smith 
875f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
876f1af5d2fSBarry Smith   ii = 0;
877f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
878f1af5d2fSBarry Smith     ic      = 5*c[i];
879f1af5d2fSBarry Smith     t[ii]   = b[ic];
880f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
881f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
882f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
883f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
884f1af5d2fSBarry Smith     ii += 5;
885f1af5d2fSBarry Smith   }
886f1af5d2fSBarry Smith 
887f1af5d2fSBarry Smith   /* forward solve the U^T */
888f1af5d2fSBarry Smith   idx = 0;
889f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
890f1af5d2fSBarry Smith 
891f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
892f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
893f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
894f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
895f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
896f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
897f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
898f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
899f1af5d2fSBarry Smith     v += 25;
900f1af5d2fSBarry Smith 
901f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
902f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
903f1af5d2fSBarry Smith     while (nz--) {
904f1af5d2fSBarry Smith       oidx = 5*(*vi++);
905f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
906f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
907f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
908f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
909f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
910f1af5d2fSBarry Smith       v  += 25;
911f1af5d2fSBarry Smith     }
912f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
913f1af5d2fSBarry Smith     idx += 5;
914f1af5d2fSBarry Smith   }
915f1af5d2fSBarry Smith   /* backward solve the L^T */
916f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
917f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
918f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
919f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
920f1af5d2fSBarry Smith     idt  = 5*i;
921f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
922f1af5d2fSBarry Smith     while (nz--) {
923f1af5d2fSBarry Smith       idx   = 5*(*vi--);
924f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
925f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
926f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
927f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
928f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
929f1af5d2fSBarry Smith       v -= 25;
930f1af5d2fSBarry Smith     }
931f1af5d2fSBarry Smith   }
932f1af5d2fSBarry Smith 
933f1af5d2fSBarry Smith   /* copy t into x according to permutation */
934f1af5d2fSBarry Smith   ii = 0;
935f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
936f1af5d2fSBarry Smith     ir      = 5*r[i];
937f1af5d2fSBarry Smith     x[ir]   = t[ii];
938f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
939f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
940f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
941f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
942f1af5d2fSBarry Smith     ii += 5;
943f1af5d2fSBarry Smith   }
944f1af5d2fSBarry Smith 
945f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
946f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9471ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9481ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
949dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
950f1af5d2fSBarry Smith   PetscFunctionReturn(0);
951f1af5d2fSBarry Smith }
952f1af5d2fSBarry Smith 
9534a2ae208SSatish Balay #undef __FUNCT__
9544a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
955dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
956f1af5d2fSBarry Smith {
957f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
958f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9596849ba73SBarry Smith   PetscErrorCode ierr;
9605d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9615d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
962690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
963f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
96487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
96587828ca2SBarry Smith   PetscScalar    *x,*b,*t;
966f1af5d2fSBarry Smith 
967f1af5d2fSBarry Smith   PetscFunctionBegin;
9681ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
970f1af5d2fSBarry Smith   t  = a->solve_work;
971f1af5d2fSBarry Smith 
972f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
973f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
974f1af5d2fSBarry Smith 
975f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
976f1af5d2fSBarry Smith   ii = 0;
977f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
978f1af5d2fSBarry Smith     ic      = 6*c[i];
979f1af5d2fSBarry Smith     t[ii]   = b[ic];
980f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
981f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
982f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
983f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
984f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
985f1af5d2fSBarry Smith     ii += 6;
986f1af5d2fSBarry Smith   }
987f1af5d2fSBarry Smith 
988f1af5d2fSBarry Smith   /* forward solve the U^T */
989f1af5d2fSBarry Smith   idx = 0;
990f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
991f1af5d2fSBarry Smith 
992f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
993f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
994f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
995f1af5d2fSBarry Smith     x6    = t[5+idx];
996f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
997f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
998f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
999f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1000f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1001f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1002f1af5d2fSBarry Smith     v += 36;
1003f1af5d2fSBarry Smith 
1004f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1005f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1006f1af5d2fSBarry Smith     while (nz--) {
1007f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1008f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1009f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1010f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1011f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1012f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1013f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1014f1af5d2fSBarry Smith       v  += 36;
1015f1af5d2fSBarry Smith     }
1016f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1017f1af5d2fSBarry Smith     t[5+idx] = s6;
1018f1af5d2fSBarry Smith     idx += 6;
1019f1af5d2fSBarry Smith   }
1020f1af5d2fSBarry Smith   /* backward solve the L^T */
1021f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1022f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1023f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1024f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1025f1af5d2fSBarry Smith     idt  = 6*i;
1026f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1027f1af5d2fSBarry Smith     s6 = t[5+idt];
1028f1af5d2fSBarry Smith     while (nz--) {
1029f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1030f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1031f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1032f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1033f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1034f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1035f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1036f1af5d2fSBarry Smith       v -= 36;
1037f1af5d2fSBarry Smith     }
1038f1af5d2fSBarry Smith   }
1039f1af5d2fSBarry Smith 
1040f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1041f1af5d2fSBarry Smith   ii = 0;
1042f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1043f1af5d2fSBarry Smith     ir      = 6*r[i];
1044f1af5d2fSBarry Smith     x[ir]   = t[ii];
1045f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1046f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1047f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1048f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1049f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1050f1af5d2fSBarry Smith     ii += 6;
1051f1af5d2fSBarry Smith   }
1052f1af5d2fSBarry Smith 
1053f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1054f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10551ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10561ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1057dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1058f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1059f1af5d2fSBarry Smith }
1060f1af5d2fSBarry Smith 
10614a2ae208SSatish Balay #undef __FUNCT__
10624a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1063dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1064f1af5d2fSBarry Smith {
1065f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1066f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10676849ba73SBarry Smith   PetscErrorCode ierr;
10685d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10695d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1070690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1071f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
107287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
107387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1074f1af5d2fSBarry Smith 
1075f1af5d2fSBarry Smith   PetscFunctionBegin;
10761ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10771ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1078f1af5d2fSBarry Smith   t  = a->solve_work;
1079f1af5d2fSBarry Smith 
1080f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1081f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1082f1af5d2fSBarry Smith 
1083f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1084f1af5d2fSBarry Smith   ii = 0;
1085f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1086f1af5d2fSBarry Smith     ic      = 7*c[i];
1087f1af5d2fSBarry Smith     t[ii]   = b[ic];
1088f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1089f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1090f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1091f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1092f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1093f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1094f1af5d2fSBarry Smith     ii += 7;
1095f1af5d2fSBarry Smith   }
1096f1af5d2fSBarry Smith 
1097f1af5d2fSBarry Smith   /* forward solve the U^T */
1098f1af5d2fSBarry Smith   idx = 0;
1099f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1100f1af5d2fSBarry Smith 
1101f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1102f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1103f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1104f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1105f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1106f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1107f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1108f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1109f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1110f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1111f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1112f1af5d2fSBarry Smith     v += 49;
1113f1af5d2fSBarry Smith 
1114f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1115f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1116f1af5d2fSBarry Smith     while (nz--) {
1117f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1118f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1119f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1120f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1121f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1122f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1123f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1124f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1125f1af5d2fSBarry Smith       v  += 49;
1126f1af5d2fSBarry Smith     }
1127f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1128f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1129f1af5d2fSBarry Smith     idx += 7;
1130f1af5d2fSBarry Smith   }
1131f1af5d2fSBarry Smith   /* backward solve the L^T */
1132f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1133f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1134f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1135f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1136f1af5d2fSBarry Smith     idt  = 7*i;
1137f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1138f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1139f1af5d2fSBarry Smith     while (nz--) {
1140f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1141f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1142f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1143f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1144f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1145f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1146f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1147f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1148f1af5d2fSBarry Smith       v -= 49;
1149f1af5d2fSBarry Smith     }
1150f1af5d2fSBarry Smith   }
1151f1af5d2fSBarry Smith 
1152f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1153f1af5d2fSBarry Smith   ii = 0;
1154f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1155f1af5d2fSBarry Smith     ir      = 7*r[i];
1156f1af5d2fSBarry Smith     x[ir]   = t[ii];
1157f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1158f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1159f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1160f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1161f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1162f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1163f1af5d2fSBarry Smith     ii += 7;
1164f1af5d2fSBarry Smith   }
1165f1af5d2fSBarry Smith 
1166f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1167f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11681ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11691ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1170dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1171f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1172f1af5d2fSBarry Smith }
1173f1af5d2fSBarry Smith 
11744e2b4712SSatish Balay /* ----------------------------------------------------------- */
11754a2ae208SSatish Balay #undef __FUNCT__
11764a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1177dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
11784e2b4712SSatish Balay {
11794e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
11804e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
11816849ba73SBarry Smith   PetscErrorCode ierr;
11825d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
11835d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
11845d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
11853f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
118687828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
11874e2b4712SSatish Balay 
11884e2b4712SSatish Balay   PetscFunctionBegin;
11891ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11901ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1191f1af5d2fSBarry Smith   t  = a->solve_work;
11924e2b4712SSatish Balay 
11934e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
11944e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
11954e2b4712SSatish Balay 
11964e2b4712SSatish Balay   /* forward solve the lower triangular */
119787828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
11984e2b4712SSatish Balay   for (i=1; i<n; i++) {
11994e2b4712SSatish Balay     v   = aa + bs2*ai[i];
12004e2b4712SSatish Balay     vi  = aj + ai[i];
12014e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1202f1af5d2fSBarry Smith     s = t + bs*i;
120387828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
12044e2b4712SSatish Balay     while (nz--) {
1205f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
12064e2b4712SSatish Balay       v += bs2;
12074e2b4712SSatish Balay     }
12084e2b4712SSatish Balay   }
12094e2b4712SSatish Balay   /* backward solve the upper triangular */
1210d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
12114e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12124e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
12134e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
12144e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
121587828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12164e2b4712SSatish Balay     while (nz--) {
1217f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
12184e2b4712SSatish Balay       v += bs2;
12194e2b4712SSatish Balay     }
1220f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
122187828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
12224e2b4712SSatish Balay   }
12234e2b4712SSatish Balay 
12244e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
12254e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12261ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1228dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
12294e2b4712SSatish Balay   PetscFunctionReturn(0);
12304e2b4712SSatish Balay }
12314e2b4712SSatish Balay 
12324a2ae208SSatish Balay #undef __FUNCT__
12334a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1234dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
12354e2b4712SSatish Balay {
12364e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
12374e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
12386849ba73SBarry Smith   PetscErrorCode ierr;
12395d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
12405d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
12413f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
124287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
124387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
12444e2b4712SSatish Balay 
12454e2b4712SSatish Balay   PetscFunctionBegin;
12461ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1248f1af5d2fSBarry Smith   t  = a->solve_work;
12494e2b4712SSatish Balay 
12504e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
12514e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
12524e2b4712SSatish Balay 
12534e2b4712SSatish Balay   /* forward solve the lower triangular */
12544e2b4712SSatish Balay   idx    = 7*(*r++);
1255f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1256f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1257f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
12584e2b4712SSatish Balay 
12594e2b4712SSatish Balay   for (i=1; i<n; i++) {
12604e2b4712SSatish Balay     v     = aa + 49*ai[i];
12614e2b4712SSatish Balay     vi    = aj + ai[i];
12624e2b4712SSatish Balay     nz    = diag[i] - ai[i];
12634e2b4712SSatish Balay     idx   = 7*(*r++);
1264f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1265f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
12664e2b4712SSatish Balay     while (nz--) {
12674e2b4712SSatish Balay       idx   = 7*(*vi++);
1268f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1269f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1270f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1271f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1272f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1273f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1274f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1275f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1276f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1277f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
12784e2b4712SSatish Balay       v += 49;
12794e2b4712SSatish Balay     }
12804e2b4712SSatish Balay     idx = 7*i;
1281f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1282f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1283f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
12844e2b4712SSatish Balay   }
12854e2b4712SSatish Balay   /* backward solve the upper triangular */
12864e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
12874e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
12884e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
12894e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
12904e2b4712SSatish Balay     idt  = 7*i;
1291f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1292f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1293f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
12944e2b4712SSatish Balay     while (nz--) {
12954e2b4712SSatish Balay       idx   = 7*(*vi++);
1296f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1297f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1298f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1299f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1300f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1301f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1302f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1303f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1304f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1305f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
13064e2b4712SSatish Balay       v += 49;
13074e2b4712SSatish Balay     }
13084e2b4712SSatish Balay     idc = 7*(*c--);
13094e2b4712SSatish Balay     v   = aa + 49*diag[i];
1310f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1311f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1312f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1313f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1314f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1315f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1316f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1317f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1318f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1319f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1320f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1321f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1322f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1323f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
13244e2b4712SSatish Balay   }
13254e2b4712SSatish Balay 
13264e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
13274e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13281ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13291ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1330dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
13314e2b4712SSatish Balay   PetscFunctionReturn(0);
13324e2b4712SSatish Balay }
13334e2b4712SSatish Balay 
13344a2ae208SSatish Balay #undef __FUNCT__
13354a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1336dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
133715091d37SBarry Smith {
133815091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1339690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1340dfbe8321SBarry Smith   PetscErrorCode    ierr;
1341690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1342d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1343d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1344d9fead3dSBarry Smith   const PetscScalar *b;
134515091d37SBarry Smith 
134615091d37SBarry Smith   PetscFunctionBegin;
1347d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13481ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
134915091d37SBarry Smith   /* forward solve the lower triangular */
135015091d37SBarry Smith   idx    = 0;
135115091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
135215091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
135315091d37SBarry Smith   x[6] = b[6+idx];
135415091d37SBarry Smith   for (i=1; i<n; i++) {
135515091d37SBarry Smith     v     =  aa + 49*ai[i];
135615091d37SBarry Smith     vi    =  aj + ai[i];
135715091d37SBarry Smith     nz    =  diag[i] - ai[i];
135815091d37SBarry Smith     idx   =  7*i;
1359f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1360f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1361f1af5d2fSBarry Smith     s7  =  b[6+idx];
136215091d37SBarry Smith     while (nz--) {
136315091d37SBarry Smith       jdx   = 7*(*vi++);
136415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
136515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
136615091d37SBarry Smith       x7    = x[6+jdx];
1367f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1368f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1369f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1370f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1371f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1372f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1373f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
137415091d37SBarry Smith       v += 49;
137515091d37SBarry Smith      }
1376f1af5d2fSBarry Smith     x[idx]   = s1;
1377f1af5d2fSBarry Smith     x[1+idx] = s2;
1378f1af5d2fSBarry Smith     x[2+idx] = s3;
1379f1af5d2fSBarry Smith     x[3+idx] = s4;
1380f1af5d2fSBarry Smith     x[4+idx] = s5;
1381f1af5d2fSBarry Smith     x[5+idx] = s6;
1382f1af5d2fSBarry Smith     x[6+idx] = s7;
138315091d37SBarry Smith   }
138415091d37SBarry Smith   /* backward solve the upper triangular */
138515091d37SBarry Smith   for (i=n-1; i>=0; i--){
138615091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
138715091d37SBarry Smith     vi   = aj + diag[i] + 1;
138815091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
138915091d37SBarry Smith     idt  = 7*i;
1390f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1391f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1392f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1393f1af5d2fSBarry Smith     s7 = x[6+idt];
139415091d37SBarry Smith     while (nz--) {
139515091d37SBarry Smith       idx   = 7*(*vi++);
139615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
139715091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
139815091d37SBarry Smith       x7    = x[6+idx];
1399f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1400f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1401f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1402f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1403f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1404f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1405f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
140615091d37SBarry Smith       v += 49;
140715091d37SBarry Smith     }
140815091d37SBarry Smith     v        = aa + 49*diag[i];
1409f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1410f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1411f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
1412f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
1413f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
1414f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
1415f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
1416f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
1417f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
1418f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
1419f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
1420f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
1421f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
1422f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
142315091d37SBarry Smith   }
142415091d37SBarry Smith 
1425d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1427dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
142815091d37SBarry Smith   PetscFunctionReturn(0);
142915091d37SBarry Smith }
143015091d37SBarry Smith 
14314a2ae208SSatish Balay #undef __FUNCT__
14324a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
1433dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
143415091d37SBarry Smith {
143515091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
143615091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
14376849ba73SBarry Smith   PetscErrorCode    ierr;
14385d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
14395d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1440d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1441d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
1442d9fead3dSBarry Smith   const PetscScalar *b;
144315091d37SBarry Smith   PetscFunctionBegin;
1444d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1446f1af5d2fSBarry Smith   t  = a->solve_work;
144715091d37SBarry Smith 
144815091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
144915091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
145015091d37SBarry Smith 
145115091d37SBarry Smith   /* forward solve the lower triangular */
145215091d37SBarry Smith   idx    = 6*(*r++);
1453f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1454f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
1455f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
145615091d37SBarry Smith   for (i=1; i<n; i++) {
145715091d37SBarry Smith     v     = aa + 36*ai[i];
145815091d37SBarry Smith     vi    = aj + ai[i];
145915091d37SBarry Smith     nz    = diag[i] - ai[i];
146015091d37SBarry Smith     idx   = 6*(*r++);
1461f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1462f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
146315091d37SBarry Smith     while (nz--) {
146415091d37SBarry Smith       idx   = 6*(*vi++);
1465f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
1466f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
1467f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1468f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1469f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1470f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1471f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1472f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
147315091d37SBarry Smith       v += 36;
147415091d37SBarry Smith     }
147515091d37SBarry Smith     idx = 6*i;
1476f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1477f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
1478f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
147915091d37SBarry Smith   }
148015091d37SBarry Smith   /* backward solve the upper triangular */
148115091d37SBarry Smith   for (i=n-1; i>=0; i--){
148215091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
148315091d37SBarry Smith     vi   = aj + diag[i] + 1;
148415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
148515091d37SBarry Smith     idt  = 6*i;
1486f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1487f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
1488f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
148915091d37SBarry Smith     while (nz--) {
149015091d37SBarry Smith       idx   = 6*(*vi++);
1491f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1492f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1493f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
1494f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1495f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1496f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1497f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1498f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1499f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
150015091d37SBarry Smith       v += 36;
150115091d37SBarry Smith     }
150215091d37SBarry Smith     idc = 6*(*c--);
150315091d37SBarry Smith     v   = aa + 36*diag[i];
1504f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
1505f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
1506f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
1507f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
1508f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
1509f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
1510f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
1511f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
1512f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
1513f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
1514f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
1515f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
151615091d37SBarry Smith   }
151715091d37SBarry Smith 
151815091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
151915091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1520d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15211ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1522dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
152315091d37SBarry Smith   PetscFunctionReturn(0);
152415091d37SBarry Smith }
152515091d37SBarry Smith 
15264a2ae208SSatish Balay #undef __FUNCT__
15274a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
1528dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
152915091d37SBarry Smith {
153015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1531690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1532dfbe8321SBarry Smith   PetscErrorCode    ierr;
1533690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1534d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1535d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
1536d9fead3dSBarry Smith   const PetscScalar *b;
153715091d37SBarry Smith 
153815091d37SBarry Smith   PetscFunctionBegin;
1539d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
154115091d37SBarry Smith   /* forward solve the lower triangular */
154215091d37SBarry Smith   idx    = 0;
154315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
154415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
154515091d37SBarry Smith   for (i=1; i<n; i++) {
154615091d37SBarry Smith     v     =  aa + 36*ai[i];
154715091d37SBarry Smith     vi    =  aj + ai[i];
154815091d37SBarry Smith     nz    =  diag[i] - ai[i];
154915091d37SBarry Smith     idx   =  6*i;
1550f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1551f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
155215091d37SBarry Smith     while (nz--) {
155315091d37SBarry Smith       jdx   = 6*(*vi++);
155415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
155515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
1556f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1557f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1558f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1559f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1560f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1561f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
156215091d37SBarry Smith       v += 36;
156315091d37SBarry Smith      }
1564f1af5d2fSBarry Smith     x[idx]   = s1;
1565f1af5d2fSBarry Smith     x[1+idx] = s2;
1566f1af5d2fSBarry Smith     x[2+idx] = s3;
1567f1af5d2fSBarry Smith     x[3+idx] = s4;
1568f1af5d2fSBarry Smith     x[4+idx] = s5;
1569f1af5d2fSBarry Smith     x[5+idx] = s6;
157015091d37SBarry Smith   }
157115091d37SBarry Smith   /* backward solve the upper triangular */
157215091d37SBarry Smith   for (i=n-1; i>=0; i--){
157315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
157415091d37SBarry Smith     vi   = aj + diag[i] + 1;
157515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
157615091d37SBarry Smith     idt  = 6*i;
1577f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1578f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1579f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
158015091d37SBarry Smith     while (nz--) {
158115091d37SBarry Smith       idx   = 6*(*vi++);
158215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
158315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
1584f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1585f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1586f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1587f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1588f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1589f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
159015091d37SBarry Smith       v += 36;
159115091d37SBarry Smith     }
159215091d37SBarry Smith     v        = aa + 36*diag[i];
1593f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
1594f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
1595f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
1596f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
1597f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
1598f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
159915091d37SBarry Smith   }
160015091d37SBarry Smith 
1601d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1603dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
160415091d37SBarry Smith   PetscFunctionReturn(0);
160515091d37SBarry Smith }
160615091d37SBarry Smith 
16074a2ae208SSatish Balay #undef __FUNCT__
16084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
1609dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
16104e2b4712SSatish Balay {
16114e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
16124e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
16136849ba73SBarry Smith   PetscErrorCode    ierr;
16145d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
16155d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
1616d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1617d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
1618d9fead3dSBarry Smith   const PetscScalar *b;
16194e2b4712SSatish Balay 
16204e2b4712SSatish Balay   PetscFunctionBegin;
1621d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1623f1af5d2fSBarry Smith   t  = a->solve_work;
16244e2b4712SSatish Balay 
16254e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16264e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
16274e2b4712SSatish Balay 
16284e2b4712SSatish Balay   /* forward solve the lower triangular */
16294e2b4712SSatish Balay   idx    = 5*(*r++);
1630f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1631f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
16324e2b4712SSatish Balay   for (i=1; i<n; i++) {
16334e2b4712SSatish Balay     v     = aa + 25*ai[i];
16344e2b4712SSatish Balay     vi    = aj + ai[i];
16354e2b4712SSatish Balay     nz    = diag[i] - ai[i];
16364e2b4712SSatish Balay     idx   = 5*(*r++);
1637f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1638f1af5d2fSBarry Smith     s5  = b[4+idx];
16394e2b4712SSatish Balay     while (nz--) {
16404e2b4712SSatish Balay       idx   = 5*(*vi++);
1641f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1642f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1643f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1644f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1645f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1646f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1647f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16484e2b4712SSatish Balay       v += 25;
16494e2b4712SSatish Balay     }
16504e2b4712SSatish Balay     idx = 5*i;
1651f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1652f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
16534e2b4712SSatish Balay   }
16544e2b4712SSatish Balay   /* backward solve the upper triangular */
16554e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
16564e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
16574e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
16584e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
16594e2b4712SSatish Balay     idt  = 5*i;
1660f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1661f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
16624e2b4712SSatish Balay     while (nz--) {
16634e2b4712SSatish Balay       idx   = 5*(*vi++);
1664f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1665f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1666f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1667f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1668f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1669f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1670f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
16714e2b4712SSatish Balay       v += 25;
16724e2b4712SSatish Balay     }
16734e2b4712SSatish Balay     idc = 5*(*c--);
16744e2b4712SSatish Balay     v   = aa + 25*diag[i];
1675f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
1676f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
1677f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
1678f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
1679f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
1680f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
1681f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
1682f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
1683f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
1684f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
16854e2b4712SSatish Balay   }
16864e2b4712SSatish Balay 
16874e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16884e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1689d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16901ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1691dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
16924e2b4712SSatish Balay   PetscFunctionReturn(0);
16934e2b4712SSatish Balay }
16944e2b4712SSatish Balay 
169584a281e5SHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
169684a281e5SHong Zhang {
169784a281e5SHong Zhang   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
169884a281e5SHong Zhang   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
169984a281e5SHong Zhang   PetscErrorCode    ierr;
170084a281e5SHong Zhang   PetscInt          jdx;
170184a281e5SHong Zhang   const MatScalar   *aa=a->a,*v;
170284a281e5SHong Zhang   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
170384a281e5SHong Zhang   const PetscScalar *b;
170484a281e5SHong Zhang 
170584a281e5SHong Zhang   PetscFunctionBegin;
170684a281e5SHong Zhang   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
170784a281e5SHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
170884a281e5SHong Zhang   /* forward solve the lower triangular */
170984a281e5SHong Zhang   idx    = 0;
171084a281e5SHong Zhang   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
171184a281e5SHong Zhang   for (i=1; i<n; i++) {
171284a281e5SHong Zhang     v   = aa + 25*ai[i];
171384a281e5SHong Zhang     vi  = aj + ai[i];
171484a281e5SHong Zhang     nz  = ai[i+1] - ai[i];
171584a281e5SHong Zhang     idx = 5*i;
171684a281e5SHong Zhang     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
171784a281e5SHong Zhang     while (nz--) {
171884a281e5SHong Zhang       jdx   = 5*(*vi++);
171984a281e5SHong Zhang       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
172084a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
172184a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
172284a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
172384a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
172484a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
172584a281e5SHong Zhang       v    += 25;
172684a281e5SHong Zhang     }
172784a281e5SHong Zhang     x[idx]   = s1;
172884a281e5SHong Zhang     x[1+idx] = s2;
172984a281e5SHong Zhang     x[2+idx] = s3;
173084a281e5SHong Zhang     x[3+idx] = s4;
173184a281e5SHong Zhang     x[4+idx] = s5;
173284a281e5SHong Zhang   }
173384a281e5SHong Zhang 
173484a281e5SHong Zhang   /* backward solve the upper triangular */
173584a281e5SHong Zhang   for (i=n-1; i>=0; i--){
173684a281e5SHong Zhang     v   = aa + 25*ai[2*n-i];
173784a281e5SHong Zhang     vi  = aj + ai[2*n-i];
173884a281e5SHong Zhang     nz  = ai[2*n-i +1] - ai[2*n-i]-1;
173984a281e5SHong Zhang     idt = 5*i;
174084a281e5SHong Zhang     s1 = x[idt];  s2 = x[1+idt];
174184a281e5SHong Zhang     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
174284a281e5SHong Zhang     while (nz--) {
174384a281e5SHong Zhang       idx   = 5*(*vi++);
174484a281e5SHong Zhang       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
174584a281e5SHong Zhang       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
174684a281e5SHong Zhang       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
174784a281e5SHong Zhang       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
174884a281e5SHong Zhang       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
174984a281e5SHong Zhang       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
175084a281e5SHong Zhang       v    += 25;
175184a281e5SHong Zhang     }
175284a281e5SHong Zhang     /* x = inv_diagonal*x */
175384a281e5SHong Zhang     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
175484a281e5SHong Zhang     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
175584a281e5SHong Zhang     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
175684a281e5SHong Zhang     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
175784a281e5SHong Zhang     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
175884a281e5SHong Zhang   }
175984a281e5SHong Zhang 
176084a281e5SHong Zhang   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
176184a281e5SHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
176284a281e5SHong Zhang   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
176384a281e5SHong Zhang   PetscFunctionReturn(0);
176484a281e5SHong Zhang }
176584a281e5SHong Zhang 
17664a2ae208SSatish Balay #undef __FUNCT__
17674a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
1768dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
176915091d37SBarry Smith {
177015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1771690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1772dfbe8321SBarry Smith   PetscErrorCode    ierr;
1773690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1774d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1775d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
1776d9fead3dSBarry Smith   const PetscScalar *b;
177715091d37SBarry Smith 
177815091d37SBarry Smith   PetscFunctionBegin;
1779d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
178115091d37SBarry Smith   /* forward solve the lower triangular */
178215091d37SBarry Smith   idx    = 0;
178315091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
178415091d37SBarry Smith   for (i=1; i<n; i++) {
178515091d37SBarry Smith     v     =  aa + 25*ai[i];
178615091d37SBarry Smith     vi    =  aj + ai[i];
178715091d37SBarry Smith     nz    =  diag[i] - ai[i];
178815091d37SBarry Smith     idx   =  5*i;
1789f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
179015091d37SBarry Smith     while (nz--) {
179115091d37SBarry Smith       jdx   = 5*(*vi++);
179215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
1793f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1794f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1795f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1796f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1797f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
179815091d37SBarry Smith       v    += 25;
179915091d37SBarry Smith     }
1800f1af5d2fSBarry Smith     x[idx]   = s1;
1801f1af5d2fSBarry Smith     x[1+idx] = s2;
1802f1af5d2fSBarry Smith     x[2+idx] = s3;
1803f1af5d2fSBarry Smith     x[3+idx] = s4;
1804f1af5d2fSBarry Smith     x[4+idx] = s5;
180515091d37SBarry Smith   }
180615091d37SBarry Smith   /* backward solve the upper triangular */
180715091d37SBarry Smith   for (i=n-1; i>=0; i--){
180815091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
180915091d37SBarry Smith     vi   = aj + diag[i] + 1;
181015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
181115091d37SBarry Smith     idt  = 5*i;
1812f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
1813f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
181415091d37SBarry Smith     while (nz--) {
181515091d37SBarry Smith       idx   = 5*(*vi++);
181615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
1817f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
1818f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
1819f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
1820f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
1821f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
182215091d37SBarry Smith       v    += 25;
182315091d37SBarry Smith     }
182415091d37SBarry Smith     v        = aa + 25*diag[i];
1825f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
1826f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
1827f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
1828f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
1829f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
183015091d37SBarry Smith   }
183115091d37SBarry Smith 
1832d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1834dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
183515091d37SBarry Smith   PetscFunctionReturn(0);
183615091d37SBarry Smith }
183715091d37SBarry Smith 
18384a2ae208SSatish Balay #undef __FUNCT__
18394a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
1840dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
18414e2b4712SSatish Balay {
18424e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
18434e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
18446849ba73SBarry Smith   PetscErrorCode    ierr;
18455d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
18465d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1847d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1848d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
1849d9fead3dSBarry Smith   const PetscScalar *b;
18504e2b4712SSatish Balay 
18514e2b4712SSatish Balay   PetscFunctionBegin;
1852d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1854f1af5d2fSBarry Smith   t  = a->solve_work;
18554e2b4712SSatish Balay 
18564e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
18574e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
18584e2b4712SSatish Balay 
18594e2b4712SSatish Balay   /* forward solve the lower triangular */
18604e2b4712SSatish Balay   idx    = 4*(*r++);
1861f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1862f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
18634e2b4712SSatish Balay   for (i=1; i<n; i++) {
18644e2b4712SSatish Balay     v     = aa + 16*ai[i];
18654e2b4712SSatish Balay     vi    = aj + ai[i];
18664e2b4712SSatish Balay     nz    = diag[i] - ai[i];
18674e2b4712SSatish Balay     idx   = 4*(*r++);
1868f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
18694e2b4712SSatish Balay     while (nz--) {
18704e2b4712SSatish Balay       idx   = 4*(*vi++);
1871f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
1872f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1873f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1874f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1875f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
18764e2b4712SSatish Balay       v    += 16;
18774e2b4712SSatish Balay     }
18784e2b4712SSatish Balay     idx        = 4*i;
1879f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1880f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
18814e2b4712SSatish Balay   }
18824e2b4712SSatish Balay   /* backward solve the upper triangular */
18834e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
18844e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
18854e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
18864e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
18874e2b4712SSatish Balay     idt  = 4*i;
1888f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1889f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
18904e2b4712SSatish Balay     while (nz--) {
18914e2b4712SSatish Balay       idx   = 4*(*vi++);
1892f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1893f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
1894f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1895f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1896f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1897f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
18984e2b4712SSatish Balay       v += 16;
18994e2b4712SSatish Balay     }
19004e2b4712SSatish Balay     idc      = 4*(*c--);
19014e2b4712SSatish Balay     v        = aa + 16*diag[i];
1902f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1903f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1904f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1905f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
19064e2b4712SSatish Balay   }
19074e2b4712SSatish Balay 
19084e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
19094e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1910d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1912dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
19134e2b4712SSatish Balay   PetscFunctionReturn(0);
19144e2b4712SSatish Balay }
1915f26ec98cSKris Buschelman 
1916f26ec98cSKris Buschelman #undef __FUNCT__
1917f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
1918dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
1919f26ec98cSKris Buschelman {
1920f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1921f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
19226849ba73SBarry Smith   PetscErrorCode    ierr;
19235d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
19245d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
1925d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1926d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
1927d9fead3dSBarry Smith   PetscScalar       *x;
1928d9fead3dSBarry Smith   const PetscScalar *b;
1929f26ec98cSKris Buschelman 
1930f26ec98cSKris Buschelman   PetscFunctionBegin;
1931d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1933f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
1934f26ec98cSKris Buschelman 
1935f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1936f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
1937f26ec98cSKris Buschelman 
1938f26ec98cSKris Buschelman   /* forward solve the lower triangular */
1939f26ec98cSKris Buschelman   idx    = 4*(*r++);
1940f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
1941f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
1942f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
1943f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
1944f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
1945f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
1946f26ec98cSKris Buschelman     vi    = aj + ai[i];
1947f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
1948f26ec98cSKris Buschelman     idx   = 4*(*r++);
1949f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
1950f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
1951f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
1952f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
1953f26ec98cSKris Buschelman     while (nz--) {
1954f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1955f26ec98cSKris Buschelman       x1  = t[idx];
1956f26ec98cSKris Buschelman       x2  = t[1+idx];
1957f26ec98cSKris Buschelman       x3  = t[2+idx];
1958f26ec98cSKris Buschelman       x4  = t[3+idx];
1959f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
1960f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
1961f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1962f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1963f26ec98cSKris Buschelman       v    += 16;
1964f26ec98cSKris Buschelman     }
1965f26ec98cSKris Buschelman     idx        = 4*i;
1966f26ec98cSKris Buschelman     t[idx]   = s1;
1967f26ec98cSKris Buschelman     t[1+idx] = s2;
1968f26ec98cSKris Buschelman     t[2+idx] = s3;
1969f26ec98cSKris Buschelman     t[3+idx] = s4;
1970f26ec98cSKris Buschelman   }
1971f26ec98cSKris Buschelman   /* backward solve the upper triangular */
1972f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
1973f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
1974f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
1975f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
1976f26ec98cSKris Buschelman     idt  = 4*i;
1977f26ec98cSKris Buschelman     s1 = t[idt];
1978f26ec98cSKris Buschelman     s2 = t[1+idt];
1979f26ec98cSKris Buschelman     s3 = t[2+idt];
1980f26ec98cSKris Buschelman     s4 = t[3+idt];
1981f26ec98cSKris Buschelman     while (nz--) {
1982f26ec98cSKris Buschelman       idx   = 4*(*vi++);
1983f26ec98cSKris Buschelman       x1  = t[idx];
1984f26ec98cSKris Buschelman       x2  = t[1+idx];
1985f26ec98cSKris Buschelman       x3  = t[2+idx];
1986f26ec98cSKris Buschelman       x4  = t[3+idx];
1987f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
1988f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
1989f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
1990f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
1991f26ec98cSKris Buschelman       v += 16;
1992f26ec98cSKris Buschelman     }
1993f26ec98cSKris Buschelman     idc      = 4*(*c--);
1994f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
1995f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
1996f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
1997f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
1998f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
1999f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
2000f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
2001f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
2002f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
2003f26ec98cSKris Buschelman  }
2004f26ec98cSKris Buschelman 
2005f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
2006f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2007d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2009dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2010f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2011f26ec98cSKris Buschelman }
2012f26ec98cSKris Buschelman 
201324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
201424c233c2SKris Buschelman 
201524c233c2SKris Buschelman #include PETSC_HAVE_SSE
201624c233c2SKris Buschelman 
201724c233c2SKris Buschelman #undef __FUNCT__
201824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
2019dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
202024c233c2SKris Buschelman {
202124c233c2SKris Buschelman   /*
202224c233c2SKris Buschelman      Note: This code uses demotion of double
202324c233c2SKris Buschelman      to float when performing the mixed-mode computation.
202424c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
202524c233c2SKris Buschelman   */
202624c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
202724c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
20286849ba73SBarry Smith   PetscErrorCode ierr;
20295d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
20305d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
203124c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
203287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
203324c233c2SKris Buschelman 
203424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
203524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
203624c233c2SKris Buschelman   unsigned long   offset;
203724c233c2SKris Buschelman 
203824c233c2SKris Buschelman   PetscFunctionBegin;
203924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
204024c233c2SKris Buschelman 
204124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
204224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
204324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
204424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
204524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
204624c233c2SKris Buschelman 
20471ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
20481ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
204924c233c2SKris Buschelman     t  = a->solve_work;
205024c233c2SKris Buschelman 
205124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
205224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
205324c233c2SKris Buschelman 
205424c233c2SKris Buschelman     /* forward solve the lower triangular */
205524c233c2SKris Buschelman     idx  = 4*(*r++);
205624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
205724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
205824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
205924c233c2SKris Buschelman 
206024c233c2SKris Buschelman     for (i=1; i<n;) {
206124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
206224c233c2SKris Buschelman       vi   =  aj      + ai[i];
206324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
206424c233c2SKris Buschelman       idx  =  4*(*r++);
206524c233c2SKris Buschelman 
206624c233c2SKris Buschelman       /* Demote sum from double to float */
206724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
206824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
206924c233c2SKris Buschelman 
207024c233c2SKris Buschelman       while (nz--) {
207124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
207224c233c2SKris Buschelman         idx = 4*(*vi++);
207324c233c2SKris Buschelman 
207424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
207524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
207624c233c2SKris Buschelman 
207724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
207824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
207924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
208024c233c2SKris Buschelman 
208124c233c2SKris Buschelman           /* First Column */
208224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
208324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
208424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
208524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
208624c233c2SKris Buschelman 
208724c233c2SKris Buschelman           /* Second Column */
208824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
208924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
209024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
209124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
209224c233c2SKris Buschelman 
209324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
209424c233c2SKris Buschelman 
209524c233c2SKris Buschelman           /* Third Column */
209624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
209724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
209824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
209924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
210024c233c2SKris Buschelman 
210124c233c2SKris Buschelman           /* Fourth Column */
210224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
210324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
210424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
210524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
210624c233c2SKris Buschelman         SSE_INLINE_END_2
210724c233c2SKris Buschelman 
210824c233c2SKris Buschelman         v  += 16;
210924c233c2SKris Buschelman       }
211024c233c2SKris Buschelman       idx = 4*i;
211124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
211224c233c2SKris Buschelman       PREFETCH_NTA(v);
211324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
211424c233c2SKris Buschelman 
211524c233c2SKris Buschelman       /* Promote result from float to double */
211624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
211724c233c2SKris Buschelman     }
211824c233c2SKris Buschelman     /* backward solve the upper triangular */
211924c233c2SKris Buschelman     idt  = 4*(n-1);
212024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
212124c233c2SKris Buschelman     v    = aa + ai16 + 16;
212224c233c2SKris Buschelman     for (i=n-1; i>=0;){
212324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
212424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
212524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
212624c233c2SKris Buschelman 
212724c233c2SKris Buschelman       /* Demote accumulator from double to float */
212824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
212924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
213024c233c2SKris Buschelman 
213124c233c2SKris Buschelman       while (nz--) {
213224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
213324c233c2SKris Buschelman         idx = 4*(*vi++);
213424c233c2SKris Buschelman 
213524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
213624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
213724c233c2SKris Buschelman 
213824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
213924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
214024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
214124c233c2SKris Buschelman 
214224c233c2SKris Buschelman           /* First Column */
214324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
214424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
214524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
214624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
214724c233c2SKris Buschelman 
214824c233c2SKris Buschelman           /* Second Column */
214924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
215024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
215124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
215224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
215324c233c2SKris Buschelman 
215424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
215524c233c2SKris Buschelman 
215624c233c2SKris Buschelman           /* Third Column */
215724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
215824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
215924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
216024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
216124c233c2SKris Buschelman 
216224c233c2SKris Buschelman           /* Fourth Column */
216324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
216424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
216524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
216624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
216724c233c2SKris Buschelman         SSE_INLINE_END_2
216824c233c2SKris Buschelman         v  += 16;
216924c233c2SKris Buschelman       }
217024c233c2SKris Buschelman       v    = aa + ai16;
217124c233c2SKris Buschelman       ai16 = 16*diag[--i];
217224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
217324c233c2SKris Buschelman       /*
217424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
217524c233c2SKris Buschelman          which was inverted as part of the factorization
217624c233c2SKris Buschelman       */
217724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
217824c233c2SKris Buschelman         /* First Column */
217924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
218024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
218124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
218224c233c2SKris Buschelman 
218324c233c2SKris Buschelman         /* Second Column */
218424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
218524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
218624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
218724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
218824c233c2SKris Buschelman 
218924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
219024c233c2SKris Buschelman 
219124c233c2SKris Buschelman         /* Third Column */
219224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
219324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
219424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
219524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
219624c233c2SKris Buschelman 
219724c233c2SKris Buschelman         /* Fourth Column */
219824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
219924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
220024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
220124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
220224c233c2SKris Buschelman 
220324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
220424c233c2SKris Buschelman       SSE_INLINE_END_3
220524c233c2SKris Buschelman 
220624c233c2SKris Buschelman       /* Promote solution from float to double */
220724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
220824c233c2SKris Buschelman 
220924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
221024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
221124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
221224c233c2SKris Buschelman       idc  = 4*(*c--);
221324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
221424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
221524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
221624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
221724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
221824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
221924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
222024c233c2SKris Buschelman       SSE_INLINE_END_2
222124c233c2SKris Buschelman       v    = aa + ai16 + 16;
222224c233c2SKris Buschelman       idt -= 4;
222324c233c2SKris Buschelman     }
222424c233c2SKris Buschelman 
222524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
222624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22271ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
22281ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2229dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
223024c233c2SKris Buschelman   SSE_SCOPE_END;
223124c233c2SKris Buschelman   PetscFunctionReturn(0);
223224c233c2SKris Buschelman }
223324c233c2SKris Buschelman 
223424c233c2SKris Buschelman #endif
22350ef38995SBarry Smith 
22360ef38995SBarry Smith 
22374e2b4712SSatish Balay /*
22384e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
22394e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
22404e2b4712SSatish Balay */
22414a2ae208SSatish Balay #undef __FUNCT__
22424a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
2243dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
22444e2b4712SSatish Balay {
22454e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2246356650c2SBarry Smith   PetscInt          n=a->mbs;
2247356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
2248dfbe8321SBarry Smith   PetscErrorCode    ierr;
2249356650c2SBarry Smith   const PetscInt    *diag = a->diag;
2250d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
2251d9fead3dSBarry Smith   PetscScalar       *x;
2252d9fead3dSBarry Smith   const PetscScalar *b;
22534e2b4712SSatish Balay 
22544e2b4712SSatish Balay   PetscFunctionBegin;
2255d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22574e2b4712SSatish Balay 
2258aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
22592853dc0eSBarry Smith   {
226087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22612853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
22622853dc0eSBarry Smith   }
2263aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
22642853dc0eSBarry Smith   {
226587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
22662853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
22672853dc0eSBarry Smith   }
2268aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
22692853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
2270e1293385SBarry Smith #else
227130d4dcafSBarry Smith   {
227287828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
2273d9fead3dSBarry Smith     const MatScalar *v;
2274356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
2275356650c2SBarry Smith     const PetscInt  *vi;
2276e1293385SBarry Smith 
22774e2b4712SSatish Balay   /* forward solve the lower triangular */
22784e2b4712SSatish Balay   idx    = 0;
2279e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
22804e2b4712SSatish Balay   for (i=1; i<n; i++) {
22814e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
22824e2b4712SSatish Balay     vi    =  aj      + ai[i];
22834e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
2284e1293385SBarry Smith     idx   +=  4;
2285f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22864e2b4712SSatish Balay     while (nz--) {
22874e2b4712SSatish Balay       jdx   = 4*(*vi++);
22884e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
2289f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2290f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2291f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2292f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
22934e2b4712SSatish Balay       v    += 16;
22944e2b4712SSatish Balay     }
2295f1af5d2fSBarry Smith     x[idx]   = s1;
2296f1af5d2fSBarry Smith     x[1+idx] = s2;
2297f1af5d2fSBarry Smith     x[2+idx] = s3;
2298f1af5d2fSBarry Smith     x[3+idx] = s4;
22994e2b4712SSatish Balay   }
23004e2b4712SSatish Balay   /* backward solve the upper triangular */
23014e555682SBarry Smith   idt = 4*(n-1);
23024e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
23034e555682SBarry Smith     ai16 = 16*diag[i];
23044e555682SBarry Smith     v    = aa + ai16 + 16;
23054e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
23064e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
2307f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2308f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
23094e2b4712SSatish Balay     while (nz--) {
23104e2b4712SSatish Balay       idx   = 4*(*vi++);
23114e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
2312f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2313f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2314f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2315f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
23164e2b4712SSatish Balay       v    += 16;
23174e2b4712SSatish Balay     }
23184e555682SBarry Smith     v        = aa + ai16;
2319f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
2320f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
2321f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
2322f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
2323329f5518SBarry Smith     idt -= 4;
23244e2b4712SSatish Balay   }
232530d4dcafSBarry Smith   }
2326e1293385SBarry Smith #endif
23274e2b4712SSatish Balay 
2328d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23291ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2330dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
23314e2b4712SSatish Balay   PetscFunctionReturn(0);
23324e2b4712SSatish Balay }
23334e2b4712SSatish Balay 
2334f26ec98cSKris Buschelman #undef __FUNCT__
2335f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
2336dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
2337f26ec98cSKris Buschelman {
2338f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
2339690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
2340dfbe8321SBarry Smith   PetscErrorCode ierr;
2341690b6cddSBarry Smith   PetscInt       *diag = a->diag;
2342f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
2343f26ec98cSKris Buschelman   PetscScalar    *x,*b;
2344f26ec98cSKris Buschelman 
2345f26ec98cSKris Buschelman   PetscFunctionBegin;
23461ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
23471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2348f26ec98cSKris Buschelman 
2349f26ec98cSKris Buschelman   {
2350f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
2351f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
2352690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
2353f26ec98cSKris Buschelman 
2354f26ec98cSKris Buschelman     /* forward solve the lower triangular */
2355f26ec98cSKris Buschelman     idx  = 0;
2356f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
2357f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
2358f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
2359f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
2360f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
2361f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
2362f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
2363f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
2364f26ec98cSKris Buschelman       idx   +=  4;
2365f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
2366f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
2367f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
2368f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
2369f26ec98cSKris Buschelman       while (nz--) {
2370f26ec98cSKris Buschelman         jdx = 4*(*vi++);
2371f26ec98cSKris Buschelman         x1  = t[jdx];
2372f26ec98cSKris Buschelman         x2  = t[1+jdx];
2373f26ec98cSKris Buschelman         x3  = t[2+jdx];
2374f26ec98cSKris Buschelman         x4  = t[3+jdx];
2375f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2376f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2377f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2378f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2379f26ec98cSKris Buschelman         v    += 16;
2380f26ec98cSKris Buschelman       }
2381f26ec98cSKris Buschelman       t[idx]   = s1;
2382f26ec98cSKris Buschelman       t[1+idx] = s2;
2383f26ec98cSKris Buschelman       t[2+idx] = s3;
2384f26ec98cSKris Buschelman       t[3+idx] = s4;
2385f26ec98cSKris Buschelman     }
2386f26ec98cSKris Buschelman     /* backward solve the upper triangular */
2387f26ec98cSKris Buschelman     idt = 4*(n-1);
2388f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
2389f26ec98cSKris Buschelman       ai16 = 16*diag[i];
2390f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
2391f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
2392f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
2393f26ec98cSKris Buschelman       s1   = t[idt];
2394f26ec98cSKris Buschelman       s2   = t[1+idt];
2395f26ec98cSKris Buschelman       s3   = t[2+idt];
2396f26ec98cSKris Buschelman       s4   = t[3+idt];
2397f26ec98cSKris Buschelman       while (nz--) {
2398f26ec98cSKris Buschelman         idx = 4*(*vi++);
2399f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
2400f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
2401f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
2402f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
2403f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2404f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2405f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2406f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2407f26ec98cSKris Buschelman         v    += 16;
2408f26ec98cSKris Buschelman       }
2409f26ec98cSKris Buschelman       v        = aa + ai16;
2410f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
2411f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
2412f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
2413f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
2414f26ec98cSKris Buschelman       idt -= 4;
2415f26ec98cSKris Buschelman     }
2416f26ec98cSKris Buschelman   }
2417f26ec98cSKris Buschelman 
24181ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
24191ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2420dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
2421f26ec98cSKris Buschelman   PetscFunctionReturn(0);
2422f26ec98cSKris Buschelman }
2423f26ec98cSKris Buschelman 
24243660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
24253660e330SKris Buschelman 
24263660e330SKris Buschelman #include PETSC_HAVE_SSE
24273660e330SKris Buschelman #undef __FUNCT__
24287cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
2429dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
24303660e330SKris Buschelman {
24313660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
24322aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
2433dfbe8321SBarry Smith   PetscErrorCode ierr;
2434dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
24353660e330SKris Buschelman   MatScalar      *aa=a->a;
243687828ca2SBarry Smith   PetscScalar    *x,*b;
24373660e330SKris Buschelman 
24383660e330SKris Buschelman   PetscFunctionBegin;
24393660e330SKris Buschelman   SSE_SCOPE_BEGIN;
24403660e330SKris Buschelman   /*
24413660e330SKris Buschelman      Note: This code currently uses demotion of double
24423660e330SKris Buschelman      to float when performing the mixed-mode computation.
24433660e330SKris Buschelman      This may not be numerically reasonable for all applications.
24443660e330SKris Buschelman   */
24453660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
24463660e330SKris Buschelman 
24471ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
24481ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
24493660e330SKris Buschelman   {
2450eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
2451eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
24522aa5897fSKris Buschelman     int            nz,i,idt,ai16;
24532aa5897fSKris Buschelman     unsigned int   jdx,idx;
24542aa5897fSKris Buschelman     unsigned short *vi;
2455eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
24563660e330SKris Buschelman 
2457eb05f457SKris Buschelman     /* First block is the identity. */
24583660e330SKris Buschelman     idx  = 0;
2459eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
24602aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
24613660e330SKris Buschelman 
24623660e330SKris Buschelman     for (i=1; i<n;) {
24633660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
24643660e330SKris Buschelman       vi   =  aj      + ai[i];
24653660e330SKris Buschelman       nz   =  diag[i] - ai[i];
24663660e330SKris Buschelman       idx +=  4;
24673660e330SKris Buschelman 
2468eb05f457SKris Buschelman       /* Demote RHS from double to float. */
2469eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
2470eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
24713660e330SKris Buschelman 
24723660e330SKris Buschelman       while (nz--) {
24733660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
24742aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
24753660e330SKris Buschelman 
24763660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
2477eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
24783660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
24793660e330SKris Buschelman 
24803660e330SKris Buschelman           /* First Column */
24813660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
24823660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
24833660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
24843660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
24853660e330SKris Buschelman 
24863660e330SKris Buschelman           /* Second Column */
24873660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
24883660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
24893660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
24903660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
24913660e330SKris Buschelman 
24923660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
24933660e330SKris Buschelman 
24943660e330SKris Buschelman           /* Third Column */
24953660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
24963660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
24973660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
24983660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
24993660e330SKris Buschelman 
25003660e330SKris Buschelman           /* Fourth Column */
25013660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25023660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25033660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25043660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25053660e330SKris Buschelman         SSE_INLINE_END_2
25063660e330SKris Buschelman 
25073660e330SKris Buschelman         v  += 16;
25083660e330SKris Buschelman       }
25093660e330SKris Buschelman       v    =  aa + 16*ai[++i];
25103660e330SKris Buschelman       PREFETCH_NTA(v);
2511eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
25123660e330SKris Buschelman     }
2513eb05f457SKris Buschelman 
2514eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
2515eb05f457SKris Buschelman 
25163660e330SKris Buschelman     idt  = 4*(n-1);
25173660e330SKris Buschelman     ai16 = 16*diag[n-1];
25183660e330SKris Buschelman     v    = aa + ai16 + 16;
25193660e330SKris Buschelman     for (i=n-1; i>=0;){
25203660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
25213660e330SKris Buschelman       vi = aj + diag[i] + 1;
25223660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
25233660e330SKris Buschelman 
2524eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
25253660e330SKris Buschelman 
25263660e330SKris Buschelman       while (nz--) {
25273660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
25282aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
25293660e330SKris Buschelman 
25303660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
2531eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
25323660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
25333660e330SKris Buschelman 
25343660e330SKris Buschelman           /* First Column */
25353660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
25363660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
25373660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
25383660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
25393660e330SKris Buschelman 
25403660e330SKris Buschelman           /* Second Column */
25413660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
25423660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
25433660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
25443660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
25453660e330SKris Buschelman 
25463660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
25473660e330SKris Buschelman 
25483660e330SKris Buschelman           /* Third Column */
25493660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
25503660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
25513660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
25523660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
25533660e330SKris Buschelman 
25543660e330SKris Buschelman           /* Fourth Column */
25553660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
25563660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
25573660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
25583660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
25593660e330SKris Buschelman         SSE_INLINE_END_2
25603660e330SKris Buschelman         v  += 16;
25613660e330SKris Buschelman       }
25623660e330SKris Buschelman       v    = aa + ai16;
25633660e330SKris Buschelman       ai16 = 16*diag[--i];
25643660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
25653660e330SKris Buschelman       /*
25663660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
25673660e330SKris Buschelman          which was inverted as part of the factorization
25683660e330SKris Buschelman       */
2569eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
25703660e330SKris Buschelman         /* First Column */
25713660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
25723660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
25733660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
25743660e330SKris Buschelman 
25753660e330SKris Buschelman         /* Second Column */
25763660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
25773660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
25783660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
25793660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
25803660e330SKris Buschelman 
25813660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
25823660e330SKris Buschelman 
25833660e330SKris Buschelman         /* Third Column */
25843660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
25853660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
25863660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
25873660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
25883660e330SKris Buschelman 
25893660e330SKris Buschelman         /* Fourth Column */
25903660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
25913660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
25923660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
25933660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
25943660e330SKris Buschelman 
25953660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
25963660e330SKris Buschelman       SSE_INLINE_END_3
25973660e330SKris Buschelman 
25983660e330SKris Buschelman       v    = aa + ai16 + 16;
25993660e330SKris Buschelman       idt -= 4;
26003660e330SKris Buschelman     }
2601eb05f457SKris Buschelman 
2602eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
2603eb05f457SKris Buschelman     idt = 4*(n-1);
2604eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
2605eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
2606eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
2607eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
2608eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
2609eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
2610eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
2611eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
2612eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
261354693613SKris Buschelman       idt -= 4;
26143660e330SKris Buschelman     }
2615eb05f457SKris Buschelman 
2616eb05f457SKris Buschelman   } /* End of artificial scope. */
26171ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
26181ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2619dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
26203660e330SKris Buschelman   SSE_SCOPE_END;
26213660e330SKris Buschelman   PetscFunctionReturn(0);
26223660e330SKris Buschelman }
26233660e330SKris Buschelman 
26247cf1b8d3SKris Buschelman #undef __FUNCT__
26257cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
2626dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
26277cf1b8d3SKris Buschelman {
26287cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
26297cf1b8d3SKris Buschelman   int            *aj=a->j;
2630dfbe8321SBarry Smith   PetscErrorCode ierr;
2631dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
26327cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
26337cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
26347cf1b8d3SKris Buschelman 
26357cf1b8d3SKris Buschelman   PetscFunctionBegin;
26367cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
26377cf1b8d3SKris Buschelman   /*
26387cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
26397cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
26407cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
26417cf1b8d3SKris Buschelman   */
26427cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
26437cf1b8d3SKris Buschelman 
26441ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
26451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
26467cf1b8d3SKris Buschelman   {
26477cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
26487cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
26497cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
26507cf1b8d3SKris Buschelman     int       jdx,idx;
26517cf1b8d3SKris Buschelman     int       *vi;
26527cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
26537cf1b8d3SKris Buschelman 
26547cf1b8d3SKris Buschelman     /* First block is the identity. */
26557cf1b8d3SKris Buschelman     idx  = 0;
26567cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
26577cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
26587cf1b8d3SKris Buschelman 
26597cf1b8d3SKris Buschelman     for (i=1; i<n;) {
26607cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
26617cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
26627cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
26637cf1b8d3SKris Buschelman       idx +=  4;
26647cf1b8d3SKris Buschelman 
26657cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
26667cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
26677cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
26687cf1b8d3SKris Buschelman 
26697cf1b8d3SKris Buschelman       while (nz--) {
26707cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
26717cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
26727cf1b8d3SKris Buschelman /*          jdx = *vi++; */
26737cf1b8d3SKris Buschelman 
26747cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
26757cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
26767cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
26777cf1b8d3SKris Buschelman 
26787cf1b8d3SKris Buschelman           /* First Column */
26797cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
26807cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
26817cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
26827cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
26837cf1b8d3SKris Buschelman 
26847cf1b8d3SKris Buschelman           /* Second Column */
26857cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
26867cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
26877cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
26887cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
26897cf1b8d3SKris Buschelman 
26907cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
26917cf1b8d3SKris Buschelman 
26927cf1b8d3SKris Buschelman           /* Third Column */
26937cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
26947cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
26957cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
26967cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
26977cf1b8d3SKris Buschelman 
26987cf1b8d3SKris Buschelman           /* Fourth Column */
26997cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
27007cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
27017cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
27027cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
27037cf1b8d3SKris Buschelman         SSE_INLINE_END_2
27047cf1b8d3SKris Buschelman 
27057cf1b8d3SKris Buschelman         v  += 16;
27067cf1b8d3SKris Buschelman       }
27077cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
27087cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
27097cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
27107cf1b8d3SKris Buschelman     }
27117cf1b8d3SKris Buschelman 
27127cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
27137cf1b8d3SKris Buschelman 
27147cf1b8d3SKris Buschelman     idt  = 4*(n-1);
27157cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
27167cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
27177cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
27187cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
27197cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
27207cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
27217cf1b8d3SKris Buschelman 
27227cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
27237cf1b8d3SKris Buschelman 
27247cf1b8d3SKris Buschelman       while (nz--) {
27257cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
27267cf1b8d3SKris Buschelman         idx = 4*(*vi++);
27277cf1b8d3SKris Buschelman /*          idx = *vi++; */
27287cf1b8d3SKris Buschelman 
27297cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
27307cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
27317cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
27327cf1b8d3SKris Buschelman 
27337cf1b8d3SKris Buschelman           /* First Column */
27347cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
27357cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
27367cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
27377cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
27387cf1b8d3SKris Buschelman 
27397cf1b8d3SKris Buschelman           /* Second Column */
27407cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
27417cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
27427cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
27437cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
27447cf1b8d3SKris Buschelman 
27457cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
27467cf1b8d3SKris Buschelman 
27477cf1b8d3SKris Buschelman           /* Third Column */
27487cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
27497cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
27507cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
27517cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
27527cf1b8d3SKris Buschelman 
27537cf1b8d3SKris Buschelman           /* Fourth Column */
27547cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
27557cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
27567cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
27577cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
27587cf1b8d3SKris Buschelman         SSE_INLINE_END_2
27597cf1b8d3SKris Buschelman         v  += 16;
27607cf1b8d3SKris Buschelman       }
27617cf1b8d3SKris Buschelman       v    = aa + ai16;
27627cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
27637cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
27647cf1b8d3SKris Buschelman       /*
27657cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
27667cf1b8d3SKris Buschelman          which was inverted as part of the factorization
27677cf1b8d3SKris Buschelman       */
27687cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
27697cf1b8d3SKris Buschelman         /* First Column */
27707cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
27717cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
27727cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
27737cf1b8d3SKris Buschelman 
27747cf1b8d3SKris Buschelman         /* Second Column */
27757cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
27767cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
27777cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
27787cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
27797cf1b8d3SKris Buschelman 
27807cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
27817cf1b8d3SKris Buschelman 
27827cf1b8d3SKris Buschelman         /* Third Column */
27837cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
27847cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
27857cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
27867cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
27877cf1b8d3SKris Buschelman 
27887cf1b8d3SKris Buschelman         /* Fourth Column */
27897cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
27907cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
27917cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
27927cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
27937cf1b8d3SKris Buschelman 
27947cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
27957cf1b8d3SKris Buschelman       SSE_INLINE_END_3
27967cf1b8d3SKris Buschelman 
27977cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
27987cf1b8d3SKris Buschelman       idt -= 4;
27997cf1b8d3SKris Buschelman     }
28007cf1b8d3SKris Buschelman 
28017cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
28027cf1b8d3SKris Buschelman     idt = 4*(n-1);
28037cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
28047cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
28057cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
28067cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
28077cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
28087cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
28097cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
28107cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
28117cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
28127cf1b8d3SKris Buschelman       idt -= 4;
28137cf1b8d3SKris Buschelman     }
28147cf1b8d3SKris Buschelman 
28157cf1b8d3SKris Buschelman   } /* End of artificial scope. */
28161ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
28171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2818dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28197cf1b8d3SKris Buschelman   SSE_SCOPE_END;
28207cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
28217cf1b8d3SKris Buschelman }
28227cf1b8d3SKris Buschelman 
28233660e330SKris Buschelman #endif
28244a2ae208SSatish Balay #undef __FUNCT__
28254a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
2826dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
28274e2b4712SSatish Balay {
28284e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
28294e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
28306849ba73SBarry Smith   PetscErrorCode    ierr;
28315d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
28325d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2833d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2834d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
2835d9fead3dSBarry Smith   const PetscScalar *b;
28364e2b4712SSatish Balay 
28374e2b4712SSatish Balay   PetscFunctionBegin;
2838d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28391ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2840f1af5d2fSBarry Smith   t  = a->solve_work;
28414e2b4712SSatish Balay 
28424e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28434e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28444e2b4712SSatish Balay 
28454e2b4712SSatish Balay   /* forward solve the lower triangular */
28464e2b4712SSatish Balay   idx    = 3*(*r++);
2847f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
28484e2b4712SSatish Balay   for (i=1; i<n; i++) {
28494e2b4712SSatish Balay     v     = aa + 9*ai[i];
28504e2b4712SSatish Balay     vi    = aj + ai[i];
28514e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28524e2b4712SSatish Balay     idx   = 3*(*r++);
2853f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
28544e2b4712SSatish Balay     while (nz--) {
28554e2b4712SSatish Balay       idx   = 3*(*vi++);
2856f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2857f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2858f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2859f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28604e2b4712SSatish Balay       v += 9;
28614e2b4712SSatish Balay     }
28624e2b4712SSatish Balay     idx = 3*i;
2863f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
28644e2b4712SSatish Balay   }
28654e2b4712SSatish Balay   /* backward solve the upper triangular */
28664e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28674e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
28684e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28694e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28704e2b4712SSatish Balay     idt  = 3*i;
2871f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
28724e2b4712SSatish Balay     while (nz--) {
28734e2b4712SSatish Balay       idx   = 3*(*vi++);
2874f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
2875f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2876f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2877f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
28784e2b4712SSatish Balay       v += 9;
28794e2b4712SSatish Balay     }
28804e2b4712SSatish Balay     idc = 3*(*c--);
28814e2b4712SSatish Balay     v   = aa + 9*diag[i];
2882f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2883f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2884f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
28854e2b4712SSatish Balay   }
28864e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28874e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2888d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28891ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2890dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
28914e2b4712SSatish Balay   PetscFunctionReturn(0);
28924e2b4712SSatish Balay }
28934e2b4712SSatish Balay 
289415091d37SBarry Smith /*
289515091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
289615091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
289715091d37SBarry Smith */
28984a2ae208SSatish Balay #undef __FUNCT__
28994a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
2900dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
290115091d37SBarry Smith {
290215091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2903690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
2904dfbe8321SBarry Smith   PetscErrorCode    ierr;
2905690b6cddSBarry Smith   PetscInt          *diag = a->diag;
2906d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2907d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
2908d9fead3dSBarry Smith   const PetscScalar *b;
2909690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
291015091d37SBarry Smith 
291115091d37SBarry Smith   PetscFunctionBegin;
2912d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29131ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
291415091d37SBarry Smith 
291515091d37SBarry Smith   /* forward solve the lower triangular */
291615091d37SBarry Smith   idx    = 0;
291715091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
291815091d37SBarry Smith   for (i=1; i<n; i++) {
291915091d37SBarry Smith     v     =  aa      + 9*ai[i];
292015091d37SBarry Smith     vi    =  aj      + ai[i];
292115091d37SBarry Smith     nz    =  diag[i] - ai[i];
292215091d37SBarry Smith     idx   +=  3;
2923f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
292415091d37SBarry Smith     while (nz--) {
292515091d37SBarry Smith       jdx   = 3*(*vi++);
292615091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
2927f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2928f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2929f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
293015091d37SBarry Smith       v    += 9;
293115091d37SBarry Smith     }
2932f1af5d2fSBarry Smith     x[idx]   = s1;
2933f1af5d2fSBarry Smith     x[1+idx] = s2;
2934f1af5d2fSBarry Smith     x[2+idx] = s3;
293515091d37SBarry Smith   }
293615091d37SBarry Smith   /* backward solve the upper triangular */
293715091d37SBarry Smith   for (i=n-1; i>=0; i--){
293815091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
293915091d37SBarry Smith     vi   = aj + diag[i] + 1;
294015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
294115091d37SBarry Smith     idt  = 3*i;
2942f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2943f1af5d2fSBarry Smith     s3 = x[2+idt];
294415091d37SBarry Smith     while (nz--) {
294515091d37SBarry Smith       idx   = 3*(*vi++);
294615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
2947f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
2948f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
2949f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
295015091d37SBarry Smith       v    += 9;
295115091d37SBarry Smith     }
295215091d37SBarry Smith     v        = aa +  9*diag[i];
2953f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
2954f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
2955f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
295615091d37SBarry Smith   }
295715091d37SBarry Smith 
2958d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29591ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2960dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
296115091d37SBarry Smith   PetscFunctionReturn(0);
296215091d37SBarry Smith }
296315091d37SBarry Smith 
29644a2ae208SSatish Balay #undef __FUNCT__
29654a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
2966dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
29674e2b4712SSatish Balay {
29684e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
29694e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
29706849ba73SBarry Smith   PetscErrorCode    ierr;
29715d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
29725d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2973d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2974d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
2975d9fead3dSBarry Smith   const PetscScalar *b;
29764e2b4712SSatish Balay 
29774e2b4712SSatish Balay   PetscFunctionBegin;
2978d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29791ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2980f1af5d2fSBarry Smith   t  = a->solve_work;
29814e2b4712SSatish Balay 
29824e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29834e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
29844e2b4712SSatish Balay 
29854e2b4712SSatish Balay   /* forward solve the lower triangular */
29864e2b4712SSatish Balay   idx    = 2*(*r++);
2987f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
29884e2b4712SSatish Balay   for (i=1; i<n; i++) {
29894e2b4712SSatish Balay     v     = aa + 4*ai[i];
29904e2b4712SSatish Balay     vi    = aj + ai[i];
29914e2b4712SSatish Balay     nz    = diag[i] - ai[i];
29924e2b4712SSatish Balay     idx   = 2*(*r++);
2993f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
29944e2b4712SSatish Balay     while (nz--) {
29954e2b4712SSatish Balay       idx   = 2*(*vi++);
2996f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
2997f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
2998f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
29994e2b4712SSatish Balay       v += 4;
30004e2b4712SSatish Balay     }
30014e2b4712SSatish Balay     idx = 2*i;
3002f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
30034e2b4712SSatish Balay   }
30044e2b4712SSatish Balay   /* backward solve the upper triangular */
30054e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30064e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
30074e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30084e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
30094e2b4712SSatish Balay     idt  = 2*i;
3010f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
30114e2b4712SSatish Balay     while (nz--) {
30124e2b4712SSatish Balay       idx   = 2*(*vi++);
3013f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
3014f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3015f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
30164e2b4712SSatish Balay       v += 4;
30174e2b4712SSatish Balay     }
30184e2b4712SSatish Balay     idc = 2*(*c--);
30194e2b4712SSatish Balay     v   = aa + 4*diag[i];
3020f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
3021f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
30224e2b4712SSatish Balay   }
30234e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30244e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3025d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3027dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
30284e2b4712SSatish Balay   PetscFunctionReturn(0);
30294e2b4712SSatish Balay }
30304e2b4712SSatish Balay 
303115091d37SBarry Smith /*
303215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
303315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
303415091d37SBarry Smith */
30354a2ae208SSatish Balay #undef __FUNCT__
30364a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
3037dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
303815091d37SBarry Smith {
303915091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3040690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
3041dfbe8321SBarry Smith   PetscErrorCode    ierr;
3042690b6cddSBarry Smith   PetscInt          *diag = a->diag;
3043d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3044d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
3045d9fead3dSBarry Smith   const PetscScalar *b;
3046690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
304715091d37SBarry Smith 
304815091d37SBarry Smith   PetscFunctionBegin;
3049d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30501ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
305115091d37SBarry Smith 
305215091d37SBarry Smith   /* forward solve the lower triangular */
305315091d37SBarry Smith   idx    = 0;
305415091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
305515091d37SBarry Smith   for (i=1; i<n; i++) {
305615091d37SBarry Smith     v     =  aa      + 4*ai[i];
305715091d37SBarry Smith     vi    =  aj      + ai[i];
305815091d37SBarry Smith     nz    =  diag[i] - ai[i];
305915091d37SBarry Smith     idx   +=  2;
3060f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
306115091d37SBarry Smith     while (nz--) {
306215091d37SBarry Smith       jdx   = 2*(*vi++);
306315091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
3064f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3065f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
306615091d37SBarry Smith       v    += 4;
306715091d37SBarry Smith     }
3068f1af5d2fSBarry Smith     x[idx]   = s1;
3069f1af5d2fSBarry Smith     x[1+idx] = s2;
307015091d37SBarry Smith   }
307115091d37SBarry Smith   /* backward solve the upper triangular */
307215091d37SBarry Smith   for (i=n-1; i>=0; i--){
307315091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
307415091d37SBarry Smith     vi   = aj + diag[i] + 1;
307515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
307615091d37SBarry Smith     idt  = 2*i;
3077f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
307815091d37SBarry Smith     while (nz--) {
307915091d37SBarry Smith       idx   = 2*(*vi++);
308015091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
3081f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
3082f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
308315091d37SBarry Smith       v    += 4;
308415091d37SBarry Smith     }
308515091d37SBarry Smith     v        = aa +  4*diag[i];
3086f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
3087f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
308815091d37SBarry Smith   }
308915091d37SBarry Smith 
3090d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30911ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3092dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
309315091d37SBarry Smith   PetscFunctionReturn(0);
309415091d37SBarry Smith }
309515091d37SBarry Smith 
30964a2ae208SSatish Balay #undef __FUNCT__
30974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
3098dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
30994e2b4712SSatish Balay {
31004e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
31014e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
31026849ba73SBarry Smith   PetscErrorCode ierr;
31035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
31045d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
31053f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
310687828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
31074e2b4712SSatish Balay 
31084e2b4712SSatish Balay   PetscFunctionBegin;
31094e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
31104e2b4712SSatish Balay 
31111ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
31121ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3113f1af5d2fSBarry Smith   t  = a->solve_work;
31144e2b4712SSatish Balay 
31154e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
31164e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
31174e2b4712SSatish Balay 
31184e2b4712SSatish Balay   /* forward solve the lower triangular */
3119f1af5d2fSBarry Smith   t[0] = b[*r++];
31204e2b4712SSatish Balay   for (i=1; i<n; i++) {
31214e2b4712SSatish Balay     v     = aa + ai[i];
31224e2b4712SSatish Balay     vi    = aj + ai[i];
31234e2b4712SSatish Balay     nz    = diag[i] - ai[i];
3124f1af5d2fSBarry Smith     s1  = b[*r++];
31254e2b4712SSatish Balay     while (nz--) {
3126f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
31274e2b4712SSatish Balay     }
3128f1af5d2fSBarry Smith     t[i] = s1;
31294e2b4712SSatish Balay   }
31304e2b4712SSatish Balay   /* backward solve the upper triangular */
31314e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
31324e2b4712SSatish Balay     v    = aa + diag[i] + 1;
31334e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
31344e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3135f1af5d2fSBarry Smith     s1 = t[i];
31364e2b4712SSatish Balay     while (nz--) {
3137f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
31384e2b4712SSatish Balay     }
3139f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
31404e2b4712SSatish Balay   }
31414e2b4712SSatish Balay 
31424e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
31434e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
31441ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
31451ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3146dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
31474e2b4712SSatish Balay   PetscFunctionReturn(0);
31484e2b4712SSatish Balay }
314915091d37SBarry Smith /*
315015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
315115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
315215091d37SBarry Smith */
31534a2ae208SSatish Balay #undef __FUNCT__
31544a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
3155dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
315615091d37SBarry Smith {
315715091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3158690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3159dfbe8321SBarry Smith   PetscErrorCode ierr;
3160690b6cddSBarry Smith   PetscInt       *diag = a->diag;
316115091d37SBarry Smith   MatScalar      *aa=a->a;
316287828ca2SBarry Smith   PetscScalar    *x,*b;
316387828ca2SBarry Smith   PetscScalar    s1,x1;
316415091d37SBarry Smith   MatScalar      *v;
3165690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
316615091d37SBarry Smith 
316715091d37SBarry Smith   PetscFunctionBegin;
31681ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
31691ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
317015091d37SBarry Smith 
317115091d37SBarry Smith   /* forward solve the lower triangular */
317215091d37SBarry Smith   idx    = 0;
317315091d37SBarry Smith   x[0]   = b[0];
317415091d37SBarry Smith   for (i=1; i<n; i++) {
317515091d37SBarry Smith     v     =  aa      + ai[i];
317615091d37SBarry Smith     vi    =  aj      + ai[i];
317715091d37SBarry Smith     nz    =  diag[i] - ai[i];
317815091d37SBarry Smith     idx   +=  1;
3179f1af5d2fSBarry Smith     s1  =  b[idx];
318015091d37SBarry Smith     while (nz--) {
318115091d37SBarry Smith       jdx   = *vi++;
318215091d37SBarry Smith       x1    = x[jdx];
3183f1af5d2fSBarry Smith       s1 -= v[0]*x1;
318415091d37SBarry Smith       v    += 1;
318515091d37SBarry Smith     }
3186f1af5d2fSBarry Smith     x[idx]   = s1;
318715091d37SBarry Smith   }
318815091d37SBarry Smith   /* backward solve the upper triangular */
318915091d37SBarry Smith   for (i=n-1; i>=0; i--){
319015091d37SBarry Smith     v    = aa + diag[i] + 1;
319115091d37SBarry Smith     vi   = aj + diag[i] + 1;
319215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
319315091d37SBarry Smith     idt  = i;
3194f1af5d2fSBarry Smith     s1 = x[idt];
319515091d37SBarry Smith     while (nz--) {
319615091d37SBarry Smith       idx   = *vi++;
319715091d37SBarry Smith       x1    = x[idx];
3198f1af5d2fSBarry Smith       s1 -= v[0]*x1;
319915091d37SBarry Smith       v    += 1;
320015091d37SBarry Smith     }
320115091d37SBarry Smith     v        = aa +  diag[i];
3202f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
320315091d37SBarry Smith   }
32041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3206dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
320715091d37SBarry Smith   PetscFunctionReturn(0);
320815091d37SBarry Smith }
32094e2b4712SSatish Balay 
32104e2b4712SSatish Balay /* ----------------------------------------------------------------*/
32116bce7ff8SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption);
32126bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
32136bce7ff8SHong Zhang 
321484a281e5SHong Zhang extern PetscErrorCode MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct(Mat,Vec,Vec);
32156bce7ff8SHong Zhang #undef __FUNCT__
32166bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
32176bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
32186bce7ff8SHong Zhang {
32196bce7ff8SHong Zhang   Mat            C=B;
32206bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
32216bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
32226bce7ff8SHong Zhang   PetscErrorCode ierr;
32236bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
32246bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
32256bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
3226914a18a2SHong Zhang   MatScalar      *rtmp,*pc,*multiplier,*v,*pv,*aa=a->a;
3227914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
3228914a18a2SHong Zhang   MatScalar      *v_work;
32296bce7ff8SHong Zhang 
32306bce7ff8SHong Zhang   PetscFunctionBegin;
32316bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
32326bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
3233914a18a2SHong Zhang   ierr = PetscMalloc((bs2*n+1)*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
3234914a18a2SHong Zhang   ierr = PetscMemzero(rtmp,(bs2*n+1)*sizeof(MatScalar));CHKERRQ(ierr);
32356bce7ff8SHong Zhang   ics  = ic;
32366bce7ff8SHong Zhang 
3237914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
3238914a18a2SHong Zhang   ierr       = PetscMalloc(bs*sizeof(PetscInt) + (bs+bs2)*sizeof(MatScalar),&v_work);CHKERRQ(ierr);
3239914a18a2SHong Zhang   multiplier = v_work + bs;
3240914a18a2SHong Zhang   v_pivots   = (PetscInt*)(multiplier + bs2);
3241914a18a2SHong Zhang 
32426bce7ff8SHong Zhang   for (i=0; i<n; i++){
32436bce7ff8SHong Zhang     /* zero rtmp */
32446bce7ff8SHong Zhang     /* L part */
32456bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
32466bce7ff8SHong Zhang     bjtmp = bj + bi[i];
3247914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3248914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3249914a18a2SHong Zhang     }
32506bce7ff8SHong Zhang 
32516bce7ff8SHong Zhang     /* U part */
32526bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i];
32536bce7ff8SHong Zhang     bjtmp = bj + bi[2*n-i];
3254914a18a2SHong Zhang     for  (j=0; j<nz; j++){
3255914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3256914a18a2SHong Zhang     }
32576bce7ff8SHong Zhang 
32586bce7ff8SHong Zhang     /* load in initial (unfactored row) */
32596bce7ff8SHong Zhang     nz    = ai[r[i]+1] - ai[r[i]];
32606bce7ff8SHong Zhang     ajtmp = aj + ai[r[i]];
3261914a18a2SHong Zhang     v     = aa + bs2*ai[r[i]];
32626bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3263914a18a2SHong Zhang       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
32646bce7ff8SHong Zhang     }
32656bce7ff8SHong Zhang 
32666bce7ff8SHong Zhang     /* elimination */
32676bce7ff8SHong Zhang     bjtmp = bj + bi[i];
32686bce7ff8SHong Zhang     row   = *bjtmp++;
32696bce7ff8SHong Zhang     nzL   = bi[i+1] - bi[i];
32706bce7ff8SHong Zhang     k   = 0;
32716bce7ff8SHong Zhang     while  (k < nzL) {
3272914a18a2SHong Zhang       pc = rtmp + bs2*row;
3273914a18a2SHong Zhang       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
3274914a18a2SHong Zhang       if (flg) {
3275914a18a2SHong Zhang         pv         = b->a + bs2*bdiag[row];
3276914a18a2SHong Zhang         Kernel_A_gets_A_times_B(bs,pc,pv,multiplier); /* *pc = *pc * (*pv); */
32776bce7ff8SHong Zhang         pj         = b->j + bi[2*n-row]; /* begining of U(row,:) */
3278914a18a2SHong Zhang         pv         = b->a + bs2*bi[2*n-row];
32796bce7ff8SHong Zhang         nz         = bi[2*n-row+1] - bi[2*n-row] - 1; /* num of entries inU(row,:), excluding diag */
3280914a18a2SHong Zhang         for (j=0; j<nz; j++) {
3281914a18a2SHong Zhang           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
3282914a18a2SHong Zhang         }
32836bce7ff8SHong Zhang         ierr = PetscLogFlops(2.0*nz);CHKERRQ(ierr);
32846bce7ff8SHong Zhang       }
32856bce7ff8SHong Zhang       row = *bjtmp++; k++;
32866bce7ff8SHong Zhang     }
32876bce7ff8SHong Zhang 
32886bce7ff8SHong Zhang     /* finished row so stick it into b->a */
32896bce7ff8SHong Zhang     /* L part */
3290914a18a2SHong Zhang     pv   = b->a + bs2*bi[i] ;
32916bce7ff8SHong Zhang     pj   = b->j + bi[i] ;
32926bce7ff8SHong Zhang     nz   = bi[i+1] - bi[i];
32936bce7ff8SHong Zhang     for (j=0; j<nz; j++) {
3294914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
32956bce7ff8SHong Zhang     }
32966bce7ff8SHong Zhang 
32976bce7ff8SHong Zhang     /* Mark diagonal and invert diagonal for simplier triangular solves */
3298914a18a2SHong Zhang     pv  = b->a + bs2*bdiag[i];
32996bce7ff8SHong Zhang     pj  = b->j + bdiag[i];
3300914a18a2SHong Zhang     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
3301914a18a2SHong Zhang     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3302914a18a2SHong Zhang     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
33036bce7ff8SHong Zhang 
33046bce7ff8SHong Zhang     /* U part */
3305914a18a2SHong Zhang     pv = b->a + bs2*bi[2*n-i];
33066bce7ff8SHong Zhang     pj = b->j + bi[2*n-i];
33076bce7ff8SHong Zhang     nz = bi[2*n-i+1] - bi[2*n-i] - 1;
3308914a18a2SHong Zhang     for (j=0; j<nz; j++){
3309914a18a2SHong Zhang       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
3310914a18a2SHong Zhang     }
33116bce7ff8SHong Zhang   }
33126bce7ff8SHong Zhang 
33136bce7ff8SHong Zhang   ierr = PetscFree(rtmp);CHKERRQ(ierr);
33146bce7ff8SHong Zhang   ierr = PetscFree(v_work);CHKERRQ(ierr);
33156bce7ff8SHong Zhang   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
33166bce7ff8SHong Zhang   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
3317*27019359SHong Zhang 
3318*27019359SHong Zhang   switch (A->rmap->bs){
3319*27019359SHong Zhang   case 2:
3320*27019359SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct;
3321*27019359SHong Zhang     break;
3322*27019359SHong Zhang 
3323*27019359SHong Zhang   case 5:
332484a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct;
3325*27019359SHong Zhang     break;
3326*27019359SHong Zhang   default:
332784a281e5SHong Zhang     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
3328*27019359SHong Zhang     break;
332984a281e5SHong Zhang   }
33306bce7ff8SHong Zhang   C->assembled = PETSC_TRUE;
3331914a18a2SHong Zhang   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
33326bce7ff8SHong Zhang   PetscFunctionReturn(0);
33336bce7ff8SHong Zhang }
33346bce7ff8SHong Zhang 
33356bce7ff8SHong Zhang /*
33366bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
33376bce7ff8SHong Zhang    Factored arrays bj and ba are stored as
33386bce7ff8SHong Zhang      L(0,:), L(1,:), ...,L(n-1,:),  U(n-1,:),...,U(i,:),U(i-1,:),...,U(0,:)
33396bce7ff8SHong Zhang 
33406bce7ff8SHong Zhang    bi=fact->i is an array of size 2n+2, in which
33416bce7ff8SHong Zhang    bi+
33426bce7ff8SHong Zhang      bi[i]      ->  1st entry of L(i,:),i=0,...,i-1
33436bce7ff8SHong Zhang      bi[n]      ->  end of L(n-1,:)+1
33446bce7ff8SHong Zhang      bi[n+1]    ->  1st entry of U(n-1,:)
33456bce7ff8SHong Zhang      bi[2n-i]   ->  1st entry of U(i,:)
33466bce7ff8SHong Zhang      bi[2n-i+1] ->  end of U(i,:)+1, the 1st entry of U(i-1,:)
33476bce7ff8SHong Zhang      bi[2n]     ->  end of U(0,:)+1
33486bce7ff8SHong Zhang 
33496bce7ff8SHong Zhang    U(i,:) contains diag[i] as its last entry, i.e.,
33506bce7ff8SHong Zhang     U(i,:) = (u[i,i+1],...,u[i,n-1],diag[i])
33516bce7ff8SHong Zhang */
33526bce7ff8SHong Zhang #undef __FUNCT__
33536bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
33546bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
33556bce7ff8SHong Zhang {
33566bce7ff8SHong Zhang 
33576bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
33586bce7ff8SHong Zhang   PetscErrorCode     ierr;
3359914a18a2SHong Zhang   PetscInt           mbs=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
33606bce7ff8SHong Zhang   PetscInt           i,j,nz=a->nz,*bi,*bj,*bdiag;
33616bce7ff8SHong Zhang 
33626bce7ff8SHong Zhang   PetscFunctionBegin;
33636bce7ff8SHong Zhang   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
33646bce7ff8SHong Zhang   b     = (Mat_SeqBAIJ*)(fact)->data;
3365914a18a2SHong Zhang   bdiag = b->diag;
33666bce7ff8SHong Zhang 
33676bce7ff8SHong Zhang   /* replace matrix arrays with single allocations, then reset values */
33686bce7ff8SHong Zhang   ierr = PetscFree3(b->a,b->j,b->i);CHKERRQ(ierr);
33696bce7ff8SHong Zhang 
33706bce7ff8SHong Zhang   ierr = PetscMalloc((2*mbs+2)*sizeof(PetscInt),&b->i);CHKERRQ(ierr);
33716bce7ff8SHong Zhang   ierr = PetscMalloc((nz+1)*sizeof(PetscInt),&b->j);CHKERRQ(ierr);
33726bce7ff8SHong Zhang   ierr = PetscMalloc((bs2*nz+1)*sizeof(PetscScalar),&b->a);CHKERRQ(ierr);
33736bce7ff8SHong Zhang   b->singlemalloc = PETSC_FALSE;
33746bce7ff8SHong Zhang   if (mbs > 0) {
33756bce7ff8SHong Zhang     ierr = PetscMemzero(b->a,bs2*nz*sizeof(MatScalar));CHKERRQ(ierr);
33766bce7ff8SHong Zhang   }
33776bce7ff8SHong Zhang 
33786bce7ff8SHong Zhang   /* set bi and bj with new data structure */
33796bce7ff8SHong Zhang   bi = b->i;
33806bce7ff8SHong Zhang   bj = b->j;
33816bce7ff8SHong Zhang 
33826bce7ff8SHong Zhang   /* L part */
33836bce7ff8SHong Zhang   bi[0] = 0;
33846bce7ff8SHong Zhang   for (i=0; i<mbs; i++){
33856bce7ff8SHong Zhang     nz = adiag[i] - ai[i];
3386914a18a2SHong Zhang     bi[i+1] = bi[i] + nz;
33876bce7ff8SHong Zhang     aj = a->j + ai[i];
33886bce7ff8SHong Zhang     for (j=0; j<nz; j++){
33896bce7ff8SHong Zhang       *bj = aj[j]; bj++;
33906bce7ff8SHong Zhang     }
33916bce7ff8SHong Zhang   }
33926bce7ff8SHong Zhang 
33936bce7ff8SHong Zhang   /* U part */
33946bce7ff8SHong Zhang   bi[mbs+1] = bi[mbs];
33956bce7ff8SHong Zhang   for (i=mbs-1; i>=0; i--){
33966bce7ff8SHong Zhang     nz = ai[i+1] - adiag[i] - 1;
33976bce7ff8SHong Zhang     if (nz < 0) SETERRQ2(0,"row %d Unz %d",i,nz);
3398914a18a2SHong Zhang     bi[2*mbs-i+1] = bi[2*mbs-i] + nz + 1;
33996bce7ff8SHong Zhang     aj = a->j + adiag[i] + 1;
34006bce7ff8SHong Zhang     for (j=0; j<nz; j++){
34016bce7ff8SHong Zhang       *bj = aj[j]; bj++;
34026bce7ff8SHong Zhang     }
34036bce7ff8SHong Zhang     /* diag[i] */
34046bce7ff8SHong Zhang     *bj = i; bj++;
34056bce7ff8SHong Zhang     bdiag[i] = bi[2*mbs-i+1]-1;
34066bce7ff8SHong Zhang   }
34076bce7ff8SHong Zhang   PetscFunctionReturn(0);
34086bce7ff8SHong Zhang }
34096bce7ff8SHong Zhang 
34104e2b4712SSatish Balay /*
34114e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
34124e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
34134e2b4712SSatish Balay    Not a good example of code reuse.
34144e2b4712SSatish Balay */
3415435faa5fSBarry Smith 
34164a2ae208SSatish Balay #undef __FUNCT__
34174a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
34180481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
34194e2b4712SSatish Balay {
34204e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
34214e2b4712SSatish Balay   IS             isicol;
34226849ba73SBarry Smith   PetscErrorCode ierr;
34235d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
34245d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
3425a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
3426d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
342741df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
3428329f5518SBarry Smith   PetscReal      f;
34294e2b4712SSatish Balay 
34304e2b4712SSatish Balay   PetscFunctionBegin;
34316bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
34326bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
34336bce7ff8SHong Zhang 
3434435faa5fSBarry Smith   f             = info->fill;
3435690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
3436690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
34374c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
3438667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
3439667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
34407d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
3441309c388cSBarry Smith 
344241df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
34436bce7ff8SHong Zhang 
34446bce7ff8SHong Zhang     PetscTruth newdatastruct=PETSC_FALSE;
34456bce7ff8SHong Zhang     ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
34466bce7ff8SHong Zhang     if (newdatastruct){
34476bce7ff8SHong Zhang       ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
34486bce7ff8SHong Zhang       (fact)->ops->lufactornumeric = MatLUFactorNumeric_SeqBAIJ_N_newdatastruct;
34496bce7ff8SHong Zhang     } else {
3450719d5645SBarry Smith       ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES);CHKERRQ(ierr);
34516bce7ff8SHong Zhang       ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
34526bce7ff8SHong Zhang     }
34536bce7ff8SHong Zhang 
3454719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
3455719d5645SBarry Smith     b            = (Mat_SeqBAIJ*)(fact)->data;
3456bb3d539aSBarry Smith     b->row       = isrow;
3457bb3d539aSBarry Smith     b->col       = iscol;
3458bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3459bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3460bb3d539aSBarry Smith     b->icol      = isicol;
3461bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3462719d5645SBarry Smith     ierr         = PetscMalloc(((fact)->rmap->N+1+(fact)->rmap->bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
34636bce7ff8SHong Zhang     PetscFunctionReturn(0);
34646bce7ff8SHong Zhang   }
34656bce7ff8SHong Zhang 
34666bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
34674e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
34684e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
34694e2b4712SSatish Balay 
34704e2b4712SSatish Balay     /* get new row pointers */
3471690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
34724e2b4712SSatish Balay     ainew[0] = 0;
34734e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
3474690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
3475690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
34764e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
3477690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
34784e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
3479690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
34804e2b4712SSatish Balay     /* im is level for each filled value */
3481690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
34824e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
3483690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
34844e2b4712SSatish Balay     dloc[0]  = 0;
34854e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
3486435faa5fSBarry Smith 
3487435faa5fSBarry Smith       /* copy prow into linked list */
34884e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
34893b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
34904e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
34914e2b4712SSatish Balay       fill[n]    = n;
3492435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
34934e2b4712SSatish Balay       while (nz--) {
34944e2b4712SSatish Balay 	fm  = n;
34954e2b4712SSatish Balay 	idx = ic[*xi++];
34964e2b4712SSatish Balay 	do {
34974e2b4712SSatish Balay 	  m  = fm;
34984e2b4712SSatish Balay 	  fm = fill[m];
34994e2b4712SSatish Balay 	} while (fm < idx);
35004e2b4712SSatish Balay 	fill[m]   = idx;
35014e2b4712SSatish Balay 	fill[idx] = fm;
35024e2b4712SSatish Balay 	im[idx]   = 0;
35034e2b4712SSatish Balay       }
3504435faa5fSBarry Smith 
3505435faa5fSBarry Smith       /* make sure diagonal entry is included */
3506435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
3507435faa5fSBarry Smith 	fm = n;
3508435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
3509435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
3510435faa5fSBarry Smith 	fill[fm]   = prow;
3511435faa5fSBarry Smith 	im[prow]   = 0;
3512435faa5fSBarry Smith 	nzf++;
3513335d9088SBarry Smith 	dcount++;
3514435faa5fSBarry Smith       }
3515435faa5fSBarry Smith 
35164e2b4712SSatish Balay       nzi = 0;
35174e2b4712SSatish Balay       row = fill[n];
35184e2b4712SSatish Balay       while (row < prow) {
35194e2b4712SSatish Balay 	incrlev = im[row] + 1;
35204e2b4712SSatish Balay 	nz      = dloc[row];
3521435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
35224e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
35234e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
35244e2b4712SSatish Balay 	fm      = row;
35254e2b4712SSatish Balay 	while (nnz-- > 0) {
35264e2b4712SSatish Balay 	  idx = *xi++;
35274e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
35284e2b4712SSatish Balay 	    flev++;
35294e2b4712SSatish Balay 	    continue;
35304e2b4712SSatish Balay 	  }
35314e2b4712SSatish Balay 	  do {
35324e2b4712SSatish Balay 	    m  = fm;
35334e2b4712SSatish Balay 	    fm = fill[m];
35344e2b4712SSatish Balay 	  } while (fm < idx);
35354e2b4712SSatish Balay 	  if (fm != idx) {
35364e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
35374e2b4712SSatish Balay 	    fill[m]   = idx;
35384e2b4712SSatish Balay 	    fill[idx] = fm;
35394e2b4712SSatish Balay 	    fm        = idx;
35404e2b4712SSatish Balay 	    nzf++;
3541ecf371e4SBarry Smith 	  } else {
35424e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
35434e2b4712SSatish Balay 	  }
35444e2b4712SSatish Balay 	  flev++;
35454e2b4712SSatish Balay 	}
35464e2b4712SSatish Balay 	row = fill[row];
35474e2b4712SSatish Balay 	nzi++;
35484e2b4712SSatish Balay       }
35494e2b4712SSatish Balay       /* copy new filled row into permanent storage */
35504e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
35514e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
3552ecf371e4SBarry Smith 
3553ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
3554ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
3555ecf371e4SBarry Smith 	/* just double the memory each time */
3556690b6cddSBarry Smith 	PetscInt maxadd = jmax;
3557ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
35584e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
35594e2b4712SSatish Balay 	jmax += maxadd;
3560ecf371e4SBarry Smith 
3561ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
35625d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
35635d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3564606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
35655d0c19d7SBarry Smith 	ajnew = xitmp;
35665d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
35675d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
3568606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
35695d0c19d7SBarry Smith 	ajfill = xitmp;
3570eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
35714e2b4712SSatish Balay       }
35725d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
35734e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
35744e2b4712SSatish Balay       dloc[prow]  = nzi;
35754e2b4712SSatish Balay       fm          = fill[n];
35764e2b4712SSatish Balay       while (nzf--) {
35775d0c19d7SBarry Smith 	*xitmp++ = fm;
35784e2b4712SSatish Balay 	*flev++ = im[fm];
35794e2b4712SSatish Balay 	fm      = fill[fm];
35804e2b4712SSatish Balay       }
3581435faa5fSBarry Smith       /* make sure row has diagonal entry */
3582435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
358377431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
35842401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
3585435faa5fSBarry Smith       }
35864e2b4712SSatish Balay     }
3587606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
35884e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
35894e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
3590606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
3591606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
35924e2b4712SSatish Balay 
35936cf91177SBarry Smith #if defined(PETSC_USE_INFO)
35944e2b4712SSatish Balay     {
3595329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
3596ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
3597ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
3598ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
3599ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
3600335d9088SBarry Smith       if (diagonal_fill) {
3601ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
3602335d9088SBarry Smith       }
36034e2b4712SSatish Balay     }
360463ba0a88SBarry Smith #endif
36054e2b4712SSatish Balay 
36064e2b4712SSatish Balay     /* put together the new matrix */
3607719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
3608719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
3609719d5645SBarry Smith     b    = (Mat_SeqBAIJ*)(fact)->data;
3610e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
3611e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
36127c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
3613a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
36144e2b4712SSatish Balay     b->j          = ajnew;
36154e2b4712SSatish Balay     b->i          = ainew;
36164e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
36174e2b4712SSatish Balay     b->diag       = dloc;
36184e2b4712SSatish Balay     b->ilen       = 0;
36194e2b4712SSatish Balay     b->imax       = 0;
36204e2b4712SSatish Balay     b->row        = isrow;
36214e2b4712SSatish Balay     b->col        = iscol;
3622bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
3623c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
3624c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
3625e51c0b9cSSatish Balay     b->icol       = isicol;
362687828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
36274e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
36284e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
3629719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
36304e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
36314e2b4712SSatish Balay 
3632719d5645SBarry Smith     (fact)->info.factor_mallocs    = reallocate;
3633719d5645SBarry Smith     (fact)->info.fill_ratio_given  = f;
3634719d5645SBarry Smith     (fact)->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
36356bce7ff8SHong Zhang 
363641df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
36378661488fSKris Buschelman   PetscFunctionReturn(0);
36388661488fSKris Buschelman }
36398661488fSKris Buschelman 
3640732ee342SKris Buschelman #undef __FUNCT__
36417e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
3642dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
36437e7071cdSKris Buschelman {
364412272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
364512272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
36465a9542e3SKris Buschelman   PetscFunctionBegin;
36477cf1b8d3SKris Buschelman   /* Undo Column scaling */
36487cf1b8d3SKris Buschelman /*    while (nz--) { */
36497cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
36507cf1b8d3SKris Buschelman /*    } */
3651c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
3652c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
36537cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
36547cf1b8d3SKris Buschelman }
36557cf1b8d3SKris Buschelman 
36567cf1b8d3SKris Buschelman #undef __FUNCT__
36577cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
3658dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
36597cf1b8d3SKris Buschelman {
36607cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3661b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
36622aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
36635a9542e3SKris Buschelman   PetscFunctionBegin;
36640b9da03eSKris Buschelman   /* Is this really necessary? */
366520235379SKris Buschelman   while (nz--) {
36660b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
36677e7071cdSKris Buschelman   }
3668c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
36697e7071cdSKris Buschelman   PetscFunctionReturn(0);
36707e7071cdSKris Buschelman }
36717e7071cdSKris Buschelman 
3672732ee342SKris Buschelman 
3673