xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 8499736ad707674778426e240b94526f94a8d49e)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
144a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering"
15dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
594a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
60dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1186929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
1196929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1206929473cSShri Abhyankar {
1216929473cSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1226929473cSShri Abhyankar   PetscErrorCode ierr;
1236929473cSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1246929473cSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
1256929473cSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1266929473cSShri Abhyankar   MatScalar      *aa=a->a,*v;
1276929473cSShri Abhyankar   PetscScalar    s1,s2,x1,x2;
1286929473cSShri Abhyankar   PetscScalar    *x,*b;
1296929473cSShri Abhyankar 
1306929473cSShri Abhyankar   PetscFunctionBegin;
1316929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1326929473cSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1336929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1346929473cSShri Abhyankar 
1356929473cSShri Abhyankar   /* forward solve the U^T */
1366929473cSShri Abhyankar   idx = 0;
1376929473cSShri Abhyankar   for (i=0; i<n; i++) {
1386929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1396929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1406929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1416929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1426929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1436929473cSShri Abhyankar     v -= bs2;
1446929473cSShri Abhyankar 
1456929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1466929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1476929473cSShri Abhyankar     for(j=0;j>-nz;j--){
1486929473cSShri Abhyankar       oidx = bs*vi[j];
1496929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1506929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1516929473cSShri Abhyankar       v  -= bs2;
1526929473cSShri Abhyankar     }
1536929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
1546929473cSShri Abhyankar     idx += bs;
1556929473cSShri Abhyankar   }
1566929473cSShri Abhyankar   /* backward solve the L^T */
1576929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
1586929473cSShri Abhyankar     v    = aa + bs2*ai[i];
1596929473cSShri Abhyankar     vi   = aj + ai[i];
1606929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
1616929473cSShri Abhyankar     idt  = bs*i;
1626929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
1636929473cSShri Abhyankar     for(j=0;j<nz;j++){
1646929473cSShri Abhyankar       idx   = bs*vi[j];
1656929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
1666929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
1676929473cSShri Abhyankar       v += bs2;
1686929473cSShri Abhyankar     }
1696929473cSShri Abhyankar   }
1706929473cSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1716929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1726929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1736929473cSShri Abhyankar   PetscFunctionReturn(0);
1746929473cSShri Abhyankar }
1756929473cSShri Abhyankar 
1766929473cSShri Abhyankar #undef __FUNCT__
1774a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
178dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
179f1af5d2fSBarry Smith {
180f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181dfbe8321SBarry Smith   PetscErrorCode ierr;
182690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
184f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18587828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
18687828ca2SBarry Smith   PetscScalar    *x,*b;
187f1af5d2fSBarry Smith 
188f1af5d2fSBarry Smith   PetscFunctionBegin;
189ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith   /* forward solve the U^T */
194f1af5d2fSBarry Smith   idx = 0;
195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
196f1af5d2fSBarry Smith 
197f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
199ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203f1af5d2fSBarry Smith     v += 9;
204f1af5d2fSBarry Smith 
205f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
206f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
207f1af5d2fSBarry Smith     while (nz--) {
208f1af5d2fSBarry Smith       oidx = 3*(*vi++);
209f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212f1af5d2fSBarry Smith       v  += 9;
213f1af5d2fSBarry Smith     }
214f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215f1af5d2fSBarry Smith     idx += 3;
216f1af5d2fSBarry Smith   }
217f1af5d2fSBarry Smith   /* backward solve the L^T */
218f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
219f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
220f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
221f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
222f1af5d2fSBarry Smith     idt  = 3*i;
223f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224f1af5d2fSBarry Smith     while (nz--) {
225f1af5d2fSBarry Smith       idx   = 3*(*vi--);
226f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229f1af5d2fSBarry Smith       v -= 9;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
2321ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
239*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct"
240*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
241*8499736aSShri Abhyankar {
242*8499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
243*8499736aSShri Abhyankar   PetscErrorCode ierr;
244*8499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
245*8499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
246*8499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
247*8499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
248*8499736aSShri Abhyankar   PetscScalar    s1,s2,s3,x1,x2,x3;
249*8499736aSShri Abhyankar   PetscScalar    *x,*b;
250*8499736aSShri Abhyankar 
251*8499736aSShri Abhyankar   PetscFunctionBegin;
252*8499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253*8499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
254*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
255*8499736aSShri Abhyankar 
256*8499736aSShri Abhyankar   /* forward solve the U^T */
257*8499736aSShri Abhyankar   idx = 0;
258*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
259*8499736aSShri Abhyankar     v     = aa + bs2*diag[i];
260*8499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
261*8499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
262*8499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
263*8499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
264*8499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
265*8499736aSShri Abhyankar     v -= bs2;
266*8499736aSShri Abhyankar 
267*8499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
268*8499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
269*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
270*8499736aSShri Abhyankar       oidx = bs*vi[j];
271*8499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
272*8499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
273*8499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
274*8499736aSShri Abhyankar       v  -= bs2;
275*8499736aSShri Abhyankar     }
276*8499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
277*8499736aSShri Abhyankar     idx += bs;
278*8499736aSShri Abhyankar   }
279*8499736aSShri Abhyankar   /* backward solve the L^T */
280*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
281*8499736aSShri Abhyankar     v    = aa + bs2*ai[i];
282*8499736aSShri Abhyankar     vi   = aj + ai[i];
283*8499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
284*8499736aSShri Abhyankar     idt  = bs*i;
285*8499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
286*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
287*8499736aSShri Abhyankar       idx   = bs*vi[j];
288*8499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
289*8499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
290*8499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
291*8499736aSShri Abhyankar       v += bs2;
292*8499736aSShri Abhyankar     }
293*8499736aSShri Abhyankar   }
294*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
295*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
296*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
297*8499736aSShri Abhyankar   PetscFunctionReturn(0);
298*8499736aSShri Abhyankar }
299*8499736aSShri Abhyankar 
300*8499736aSShri Abhyankar #undef __FUNCT__
3014a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
302dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
303f1af5d2fSBarry Smith {
304f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305dfbe8321SBarry Smith   PetscErrorCode ierr;
306690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
308f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
30987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
31087828ca2SBarry Smith   PetscScalar    *x,*b;
311f1af5d2fSBarry Smith 
312f1af5d2fSBarry Smith   PetscFunctionBegin;
313ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3141ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316f1af5d2fSBarry Smith 
317f1af5d2fSBarry Smith   /* forward solve the U^T */
318f1af5d2fSBarry Smith   idx = 0;
319f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
322f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
323ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328f1af5d2fSBarry Smith     v += 16;
329f1af5d2fSBarry Smith 
330f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
331f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
332f1af5d2fSBarry Smith     while (nz--) {
333f1af5d2fSBarry Smith       oidx = 4*(*vi++);
334f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338f1af5d2fSBarry Smith       v  += 16;
339f1af5d2fSBarry Smith     }
340f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341f1af5d2fSBarry Smith     idx += 4;
342f1af5d2fSBarry Smith   }
343f1af5d2fSBarry Smith   /* backward solve the L^T */
344f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
345f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
346f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
347f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
348f1af5d2fSBarry Smith     idt  = 4*i;
349f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350f1af5d2fSBarry Smith     while (nz--) {
351f1af5d2fSBarry Smith       idx   = 4*(*vi--);
352f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356f1af5d2fSBarry Smith       v -= 16;
357f1af5d2fSBarry Smith     }
358f1af5d2fSBarry Smith   }
3591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362f1af5d2fSBarry Smith   PetscFunctionReturn(0);
363f1af5d2fSBarry Smith }
364f1af5d2fSBarry Smith 
3654a2ae208SSatish Balay #undef __FUNCT__
366*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct"
367*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
368*8499736aSShri Abhyankar {
369*8499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
370*8499736aSShri Abhyankar   PetscErrorCode ierr;
371*8499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
372*8499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
373*8499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
374*8499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
375*8499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
376*8499736aSShri Abhyankar   PetscScalar    *x,*b;
377*8499736aSShri Abhyankar 
378*8499736aSShri Abhyankar   PetscFunctionBegin;
379*8499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380*8499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
381*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
382*8499736aSShri Abhyankar 
383*8499736aSShri Abhyankar   /* forward solve the U^T */
384*8499736aSShri Abhyankar   idx = 0;
385*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
386*8499736aSShri Abhyankar     v     = aa + bs2*diag[i];
387*8499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
388*8499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
389*8499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
390*8499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
391*8499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
392*8499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
393*8499736aSShri Abhyankar     v -= bs2;
394*8499736aSShri Abhyankar 
395*8499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
396*8499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
397*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
398*8499736aSShri Abhyankar       oidx = bs*vi[j];
399*8499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
400*8499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
401*8499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
402*8499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
403*8499736aSShri Abhyankar       v  -= bs2;
404*8499736aSShri Abhyankar     }
405*8499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
406*8499736aSShri Abhyankar     idx += bs;
407*8499736aSShri Abhyankar   }
408*8499736aSShri Abhyankar   /* backward solve the L^T */
409*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
410*8499736aSShri Abhyankar     v    = aa + bs2*ai[i];
411*8499736aSShri Abhyankar     vi   = aj + ai[i];
412*8499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
413*8499736aSShri Abhyankar     idt  = bs*i;
414*8499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
415*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
416*8499736aSShri Abhyankar       idx   = bs*vi[j];
417*8499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
418*8499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
419*8499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
420*8499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
421*8499736aSShri Abhyankar       v += bs2;
422*8499736aSShri Abhyankar     }
423*8499736aSShri Abhyankar   }
424*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
425*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
426*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
427*8499736aSShri Abhyankar   PetscFunctionReturn(0);
428*8499736aSShri Abhyankar }
429*8499736aSShri Abhyankar 
430*8499736aSShri Abhyankar #undef __FUNCT__
4314a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
432dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
433f1af5d2fSBarry Smith {
434f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435dfbe8321SBarry Smith   PetscErrorCode ierr;
436690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
438f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
43987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
44087828ca2SBarry Smith   PetscScalar    *x,*b;
441f1af5d2fSBarry Smith 
442f1af5d2fSBarry Smith   PetscFunctionBegin;
443ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4441ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446f1af5d2fSBarry Smith 
447f1af5d2fSBarry Smith   /* forward solve the U^T */
448f1af5d2fSBarry Smith   idx = 0;
449f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
450f1af5d2fSBarry Smith 
451f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
452f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
453ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459f1af5d2fSBarry Smith     v += 25;
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
462f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
463f1af5d2fSBarry Smith     while (nz--) {
464f1af5d2fSBarry Smith       oidx = 5*(*vi++);
465f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470f1af5d2fSBarry Smith       v  += 25;
471f1af5d2fSBarry Smith     }
472f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473f1af5d2fSBarry Smith     idx += 5;
474f1af5d2fSBarry Smith   }
475f1af5d2fSBarry Smith   /* backward solve the L^T */
476f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
477f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
478f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
479f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
480f1af5d2fSBarry Smith     idt  = 5*i;
481f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482f1af5d2fSBarry Smith     while (nz--) {
483f1af5d2fSBarry Smith       idx   = 5*(*vi--);
484f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489f1af5d2fSBarry Smith       v -= 25;
490f1af5d2fSBarry Smith     }
491f1af5d2fSBarry Smith   }
4921ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495f1af5d2fSBarry Smith   PetscFunctionReturn(0);
496f1af5d2fSBarry Smith }
497f1af5d2fSBarry Smith 
4984a2ae208SSatish Balay #undef __FUNCT__
499*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct"
500*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
501*8499736aSShri Abhyankar {
502*8499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
503*8499736aSShri Abhyankar   PetscErrorCode ierr;
504*8499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
505*8499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
506*8499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507*8499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
508*8499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
509*8499736aSShri Abhyankar   PetscScalar    *x,*b;
510*8499736aSShri Abhyankar 
511*8499736aSShri Abhyankar   PetscFunctionBegin;
512*8499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513*8499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
514*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
515*8499736aSShri Abhyankar 
516*8499736aSShri Abhyankar   /* forward solve the U^T */
517*8499736aSShri Abhyankar   idx = 0;
518*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
519*8499736aSShri Abhyankar     v     = aa + bs2*diag[i];
520*8499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
521*8499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
522*8499736aSShri Abhyankar     x5 = x[4+idx];
523*8499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
524*8499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
525*8499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
526*8499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
527*8499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
528*8499736aSShri Abhyankar     v -= bs2;
529*8499736aSShri Abhyankar 
530*8499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
531*8499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
532*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
533*8499736aSShri Abhyankar       oidx = bs*vi[j];
534*8499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
535*8499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
536*8499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
537*8499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
538*8499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
539*8499736aSShri Abhyankar       v  -= bs2;
540*8499736aSShri Abhyankar     }
541*8499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
542*8499736aSShri Abhyankar     idx += bs;
543*8499736aSShri Abhyankar   }
544*8499736aSShri Abhyankar   /* backward solve the L^T */
545*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
546*8499736aSShri Abhyankar     v    = aa + bs2*ai[i];
547*8499736aSShri Abhyankar     vi   = aj + ai[i];
548*8499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
549*8499736aSShri Abhyankar     idt  = bs*i;
550*8499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
551*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
552*8499736aSShri Abhyankar       idx   = bs*vi[j];
553*8499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
554*8499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
555*8499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
556*8499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
557*8499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
558*8499736aSShri Abhyankar       v += bs2;
559*8499736aSShri Abhyankar     }
560*8499736aSShri Abhyankar   }
561*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
562*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
563*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
564*8499736aSShri Abhyankar   PetscFunctionReturn(0);
565*8499736aSShri Abhyankar }
566*8499736aSShri Abhyankar 
567*8499736aSShri Abhyankar #undef __FUNCT__
5684a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
569dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
570f1af5d2fSBarry Smith {
571f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572dfbe8321SBarry Smith   PetscErrorCode ierr;
573690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
575f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
57687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
57787828ca2SBarry Smith   PetscScalar    *x,*b;
578f1af5d2fSBarry Smith 
579f1af5d2fSBarry Smith   PetscFunctionBegin;
580ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5811ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583f1af5d2fSBarry Smith 
584f1af5d2fSBarry Smith   /* forward solve the U^T */
585f1af5d2fSBarry Smith   idx = 0;
586f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
587f1af5d2fSBarry Smith 
588f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
589f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
590ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591ef66eb69SBarry Smith     x6    = x[5+idx];
592f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598f1af5d2fSBarry Smith     v += 36;
599f1af5d2fSBarry Smith 
600f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
601f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
602f1af5d2fSBarry Smith     while (nz--) {
603f1af5d2fSBarry Smith       oidx = 6*(*vi++);
604f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610f1af5d2fSBarry Smith       v  += 36;
611f1af5d2fSBarry Smith     }
612f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613f1af5d2fSBarry Smith     x[5+idx] = s6;
614f1af5d2fSBarry Smith     idx += 6;
615f1af5d2fSBarry Smith   }
616f1af5d2fSBarry Smith   /* backward solve the L^T */
617f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
618f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
619f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
620f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
621f1af5d2fSBarry Smith     idt  = 6*i;
622f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623f1af5d2fSBarry Smith     s6 = x[5+idt];
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       idx   = 6*(*vi--);
626f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v -= 36;
633f1af5d2fSBarry Smith     }
634f1af5d2fSBarry Smith   }
6351ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638f1af5d2fSBarry Smith   PetscFunctionReturn(0);
639f1af5d2fSBarry Smith }
640f1af5d2fSBarry Smith 
6414a2ae208SSatish Balay #undef __FUNCT__
642*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct"
643*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
644*8499736aSShri Abhyankar {
645*8499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
646*8499736aSShri Abhyankar   PetscErrorCode ierr;
647*8499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
648*8499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
649*8499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
650*8499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
651*8499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
652*8499736aSShri Abhyankar   PetscScalar    *x,*b;
653*8499736aSShri Abhyankar 
654*8499736aSShri Abhyankar   PetscFunctionBegin;
655*8499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656*8499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
657*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
658*8499736aSShri Abhyankar 
659*8499736aSShri Abhyankar   /* forward solve the U^T */
660*8499736aSShri Abhyankar   idx = 0;
661*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
662*8499736aSShri Abhyankar     v     = aa + bs2*diag[i];
663*8499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
664*8499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
665*8499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
666*8499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
667*8499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
668*8499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
669*8499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
670*8499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
671*8499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
672*8499736aSShri Abhyankar     v -= bs2;
673*8499736aSShri Abhyankar 
674*8499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
675*8499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
676*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
677*8499736aSShri Abhyankar       oidx = bs*vi[j];
678*8499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
679*8499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
680*8499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
681*8499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
682*8499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
683*8499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
684*8499736aSShri Abhyankar       v  -= bs2;
685*8499736aSShri Abhyankar     }
686*8499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
687*8499736aSShri Abhyankar     x[5+idx] = s6;
688*8499736aSShri Abhyankar     idx += bs;
689*8499736aSShri Abhyankar   }
690*8499736aSShri Abhyankar   /* backward solve the L^T */
691*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
692*8499736aSShri Abhyankar     v    = aa + bs2*ai[i];
693*8499736aSShri Abhyankar     vi   = aj + ai[i];
694*8499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
695*8499736aSShri Abhyankar     idt  = bs*i;
696*8499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
697*8499736aSShri Abhyankar     s6   = x[5+idt];
698*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
699*8499736aSShri Abhyankar       idx   = bs*vi[j];
700*8499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
701*8499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
702*8499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
703*8499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
704*8499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
705*8499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
706*8499736aSShri Abhyankar       v += bs2;
707*8499736aSShri Abhyankar     }
708*8499736aSShri Abhyankar   }
709*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
710*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
711*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
712*8499736aSShri Abhyankar   PetscFunctionReturn(0);
713*8499736aSShri Abhyankar }
714*8499736aSShri Abhyankar 
715*8499736aSShri Abhyankar #undef __FUNCT__
7164a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
717dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
718f1af5d2fSBarry Smith {
719f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720dfbe8321SBarry Smith   PetscErrorCode ierr;
721690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
723f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
72487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
72587828ca2SBarry Smith   PetscScalar    *x,*b;
726f1af5d2fSBarry Smith 
727f1af5d2fSBarry Smith   PetscFunctionBegin;
728ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731f1af5d2fSBarry Smith 
732f1af5d2fSBarry Smith   /* forward solve the U^T */
733f1af5d2fSBarry Smith   idx = 0;
734f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
735f1af5d2fSBarry Smith 
736f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
737f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
738ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747f1af5d2fSBarry Smith     v += 49;
748f1af5d2fSBarry Smith 
749f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
750f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       oidx = 7*(*vi++);
753f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760f1af5d2fSBarry Smith       v  += 49;
761f1af5d2fSBarry Smith     }
762f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
764f1af5d2fSBarry Smith     idx += 7;
765f1af5d2fSBarry Smith   }
766f1af5d2fSBarry Smith   /* backward solve the L^T */
767f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
768f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
769f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
770f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
771f1af5d2fSBarry Smith     idt  = 7*i;
772f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
774f1af5d2fSBarry Smith     while (nz--) {
775f1af5d2fSBarry Smith       idx   = 7*(*vi--);
776f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783f1af5d2fSBarry Smith       v -= 49;
784f1af5d2fSBarry Smith     }
785f1af5d2fSBarry Smith   }
7861ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789f1af5d2fSBarry Smith   PetscFunctionReturn(0);
790f1af5d2fSBarry Smith }
791*8499736aSShri Abhyankar #undef __FUNCT__
792*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct"
793*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
794*8499736aSShri Abhyankar {
795*8499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
796*8499736aSShri Abhyankar   PetscErrorCode ierr;
797*8499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
798*8499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
799*8499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
800*8499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
801*8499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
802*8499736aSShri Abhyankar   PetscScalar    *x,*b;
803*8499736aSShri Abhyankar 
804*8499736aSShri Abhyankar   PetscFunctionBegin;
805*8499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806*8499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
807*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
808*8499736aSShri Abhyankar 
809*8499736aSShri Abhyankar   /* forward solve the U^T */
810*8499736aSShri Abhyankar   idx = 0;
811*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
812*8499736aSShri Abhyankar     v     = aa + bs2*diag[i];
813*8499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
814*8499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
815*8499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
816*8499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
817*8499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
818*8499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
819*8499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
820*8499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
821*8499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
822*8499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
823*8499736aSShri Abhyankar     v -= bs2;
824*8499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
825*8499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
826*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
827*8499736aSShri Abhyankar       oidx = bs*vi[j];
828*8499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
829*8499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
830*8499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
831*8499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
832*8499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
833*8499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
834*8499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
835*8499736aSShri Abhyankar       v  -= bs2;
836*8499736aSShri Abhyankar     }
837*8499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
838*8499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
839*8499736aSShri Abhyankar     idx += bs;
840*8499736aSShri Abhyankar   }
841*8499736aSShri Abhyankar   /* backward solve the L^T */
842*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
843*8499736aSShri Abhyankar     v    = aa + bs2*ai[i];
844*8499736aSShri Abhyankar     vi   = aj + ai[i];
845*8499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
846*8499736aSShri Abhyankar     idt  = bs*i;
847*8499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
848*8499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
849*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
850*8499736aSShri Abhyankar       idx   = bs*vi[j];
851*8499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
852*8499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
853*8499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
854*8499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
855*8499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
856*8499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
857*8499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
858*8499736aSShri Abhyankar       v += bs2;
859*8499736aSShri Abhyankar     }
860*8499736aSShri Abhyankar   }
861*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
862*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
863*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
864*8499736aSShri Abhyankar   PetscFunctionReturn(0);
865*8499736aSShri Abhyankar }
866f1af5d2fSBarry Smith 
867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
8684a2ae208SSatish Balay #undef __FUNCT__
8694a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1"
870dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
871f1af5d2fSBarry Smith {
872f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8746849ba73SBarry Smith   PetscErrorCode ierr;
8755d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8765d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877690b6cddSBarry Smith   PetscInt       *diag = a->diag;
878f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
87987828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
880f1af5d2fSBarry Smith 
881f1af5d2fSBarry Smith   PetscFunctionBegin;
8821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884f1af5d2fSBarry Smith   t  = a->solve_work;
885f1af5d2fSBarry Smith 
886f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
890f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
891f1af5d2fSBarry Smith     t[i] = b[c[i]];
892f1af5d2fSBarry Smith   }
893f1af5d2fSBarry Smith 
894f1af5d2fSBarry Smith   /* forward solve the U^T */
895f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
896f1af5d2fSBarry Smith 
897f1af5d2fSBarry Smith     v     = aa + diag[i];
898f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
899f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
900f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
901f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
902f1af5d2fSBarry Smith     while (nz--) {
903f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
904f1af5d2fSBarry Smith     }
905f1af5d2fSBarry Smith     t[i]   = s1;
906f1af5d2fSBarry Smith   }
907f1af5d2fSBarry Smith   /* backward solve the L^T */
908f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
909f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
910f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
911f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
912f1af5d2fSBarry Smith     s1   = t[i];
913f1af5d2fSBarry Smith     while (nz--) {
914f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
915f1af5d2fSBarry Smith     }
916f1af5d2fSBarry Smith   }
917f1af5d2fSBarry Smith 
918f1af5d2fSBarry Smith   /* copy t into x according to permutation */
919f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
920f1af5d2fSBarry Smith     x[r[i]]   = t[i];
921f1af5d2fSBarry Smith   }
922f1af5d2fSBarry Smith 
923f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9251ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928f1af5d2fSBarry Smith   PetscFunctionReturn(0);
929f1af5d2fSBarry Smith }
930f1af5d2fSBarry Smith 
9314a2ae208SSatish Balay #undef __FUNCT__
9324a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
933dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
934f1af5d2fSBarry Smith {
935f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9376849ba73SBarry Smith   PetscErrorCode ierr;
9385d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9395d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
94287828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
94387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
944f1af5d2fSBarry Smith 
945f1af5d2fSBarry Smith   PetscFunctionBegin;
9461ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948f1af5d2fSBarry Smith   t  = a->solve_work;
949f1af5d2fSBarry Smith 
950f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952f1af5d2fSBarry Smith 
953f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
954f1af5d2fSBarry Smith   ii = 0;
955f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
956f1af5d2fSBarry Smith     ic      = 2*c[i];
957f1af5d2fSBarry Smith     t[ii]   = b[ic];
958f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
959f1af5d2fSBarry Smith     ii += 2;
960f1af5d2fSBarry Smith   }
961f1af5d2fSBarry Smith 
962f1af5d2fSBarry Smith   /* forward solve the U^T */
963f1af5d2fSBarry Smith   idx = 0;
964f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
965f1af5d2fSBarry Smith 
966f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
967f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
968f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
969f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
970f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
971f1af5d2fSBarry Smith     v += 4;
972f1af5d2fSBarry Smith 
973f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
974f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
975f1af5d2fSBarry Smith     while (nz--) {
976f1af5d2fSBarry Smith       oidx = 2*(*vi++);
977f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979f1af5d2fSBarry Smith       v  += 4;
980f1af5d2fSBarry Smith     }
981f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
982f1af5d2fSBarry Smith     idx += 2;
983f1af5d2fSBarry Smith   }
984f1af5d2fSBarry Smith   /* backward solve the L^T */
985f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
986f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
987f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
988f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
989f1af5d2fSBarry Smith     idt  = 2*i;
990f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
991f1af5d2fSBarry Smith     while (nz--) {
992f1af5d2fSBarry Smith       idx   = 2*(*vi--);
993f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995f1af5d2fSBarry Smith       v -= 4;
996f1af5d2fSBarry Smith     }
997f1af5d2fSBarry Smith   }
998f1af5d2fSBarry Smith 
999f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1000f1af5d2fSBarry Smith   ii = 0;
1001f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1002f1af5d2fSBarry Smith     ir      = 2*r[i];
1003f1af5d2fSBarry Smith     x[ir]   = t[ii];
1004f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1005f1af5d2fSBarry Smith     ii += 2;
1006f1af5d2fSBarry Smith   }
1007f1af5d2fSBarry Smith 
1008f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1014f1af5d2fSBarry Smith }
1015f1af5d2fSBarry Smith 
10164a2ae208SSatish Balay #undef __FUNCT__
10174a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
1018dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
1019f1af5d2fSBarry Smith {
1020f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1021f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
10226849ba73SBarry Smith   PetscErrorCode ierr;
10235d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
10245d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1025690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1026f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
102787828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
102887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1029f1af5d2fSBarry Smith 
1030f1af5d2fSBarry Smith   PetscFunctionBegin;
10311ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
10321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1033f1af5d2fSBarry Smith   t  = a->solve_work;
1034f1af5d2fSBarry Smith 
1035f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1036f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1037f1af5d2fSBarry Smith 
1038f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1039f1af5d2fSBarry Smith   ii = 0;
1040f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1041f1af5d2fSBarry Smith     ic      = 3*c[i];
1042f1af5d2fSBarry Smith     t[ii]   = b[ic];
1043f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1044f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1045f1af5d2fSBarry Smith     ii += 3;
1046f1af5d2fSBarry Smith   }
1047f1af5d2fSBarry Smith 
1048f1af5d2fSBarry Smith   /* forward solve the U^T */
1049f1af5d2fSBarry Smith   idx = 0;
1050f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1051f1af5d2fSBarry Smith 
1052f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1053f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1054f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1055f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1056f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1057f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1058f1af5d2fSBarry Smith     v += 9;
1059f1af5d2fSBarry Smith 
1060f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1061f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1062f1af5d2fSBarry Smith     while (nz--) {
1063f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1064f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1065f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1066f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1067f1af5d2fSBarry Smith       v  += 9;
1068f1af5d2fSBarry Smith     }
1069f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1070f1af5d2fSBarry Smith     idx += 3;
1071f1af5d2fSBarry Smith   }
1072f1af5d2fSBarry Smith   /* backward solve the L^T */
1073f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1074f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1075f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1076f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1077f1af5d2fSBarry Smith     idt  = 3*i;
1078f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1079f1af5d2fSBarry Smith     while (nz--) {
1080f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1081f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1082f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1083f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1084f1af5d2fSBarry Smith       v -= 9;
1085f1af5d2fSBarry Smith     }
1086f1af5d2fSBarry Smith   }
1087f1af5d2fSBarry Smith 
1088f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1089f1af5d2fSBarry Smith   ii = 0;
1090f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1091f1af5d2fSBarry Smith     ir      = 3*r[i];
1092f1af5d2fSBarry Smith     x[ir]   = t[ii];
1093f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1094f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1095f1af5d2fSBarry Smith     ii += 3;
1096f1af5d2fSBarry Smith   }
1097f1af5d2fSBarry Smith 
1098f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1099f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11001ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11011ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1102dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1103f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1104f1af5d2fSBarry Smith }
1105f1af5d2fSBarry Smith 
11064a2ae208SSatish Balay #undef __FUNCT__
11074a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
1108dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
1109f1af5d2fSBarry Smith {
1110f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1111f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
11126849ba73SBarry Smith   PetscErrorCode ierr;
11135d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
11145d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1115690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1116f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
111787828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
111887828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1119f1af5d2fSBarry Smith 
1120f1af5d2fSBarry Smith   PetscFunctionBegin;
11211ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1123f1af5d2fSBarry Smith   t  = a->solve_work;
1124f1af5d2fSBarry Smith 
1125f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1126f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1127f1af5d2fSBarry Smith 
1128f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1129f1af5d2fSBarry Smith   ii = 0;
1130f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1131f1af5d2fSBarry Smith     ic      = 4*c[i];
1132f1af5d2fSBarry Smith     t[ii]   = b[ic];
1133f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1134f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1135f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1136f1af5d2fSBarry Smith     ii += 4;
1137f1af5d2fSBarry Smith   }
1138f1af5d2fSBarry Smith 
1139f1af5d2fSBarry Smith   /* forward solve the U^T */
1140f1af5d2fSBarry Smith   idx = 0;
1141f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1142f1af5d2fSBarry Smith 
1143f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1144f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1145f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1146f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1147f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1148f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1149f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1150f1af5d2fSBarry Smith     v += 16;
1151f1af5d2fSBarry Smith 
1152f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1153f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1154f1af5d2fSBarry Smith     while (nz--) {
1155f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1156f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1157f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1158f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1159f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1160f1af5d2fSBarry Smith       v  += 16;
1161f1af5d2fSBarry Smith     }
1162f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1163f1af5d2fSBarry Smith     idx += 4;
1164f1af5d2fSBarry Smith   }
1165f1af5d2fSBarry Smith   /* backward solve the L^T */
1166f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1167f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1168f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1169f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1170f1af5d2fSBarry Smith     idt  = 4*i;
1171f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1172f1af5d2fSBarry Smith     while (nz--) {
1173f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1174f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1175f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1176f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1177f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1178f1af5d2fSBarry Smith       v -= 16;
1179f1af5d2fSBarry Smith     }
1180f1af5d2fSBarry Smith   }
1181f1af5d2fSBarry Smith 
1182f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1183f1af5d2fSBarry Smith   ii = 0;
1184f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1185f1af5d2fSBarry Smith     ir      = 4*r[i];
1186f1af5d2fSBarry Smith     x[ir]   = t[ii];
1187f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1188f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1189f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1190f1af5d2fSBarry Smith     ii += 4;
1191f1af5d2fSBarry Smith   }
1192f1af5d2fSBarry Smith 
1193f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1194f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11951ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1197dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1198f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1199f1af5d2fSBarry Smith }
1200f1af5d2fSBarry Smith 
12014a2ae208SSatish Balay #undef __FUNCT__
12024a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
1203dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
1204f1af5d2fSBarry Smith {
1205f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1206f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
12076849ba73SBarry Smith   PetscErrorCode ierr;
12085d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
12095d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1210690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1211f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
121287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
121387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1214f1af5d2fSBarry Smith 
1215f1af5d2fSBarry Smith   PetscFunctionBegin;
12161ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1218f1af5d2fSBarry Smith   t  = a->solve_work;
1219f1af5d2fSBarry Smith 
1220f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1221f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1222f1af5d2fSBarry Smith 
1223f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1224f1af5d2fSBarry Smith   ii = 0;
1225f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1226f1af5d2fSBarry Smith     ic      = 5*c[i];
1227f1af5d2fSBarry Smith     t[ii]   = b[ic];
1228f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1229f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1230f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1231f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1232f1af5d2fSBarry Smith     ii += 5;
1233f1af5d2fSBarry Smith   }
1234f1af5d2fSBarry Smith 
1235f1af5d2fSBarry Smith   /* forward solve the U^T */
1236f1af5d2fSBarry Smith   idx = 0;
1237f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1238f1af5d2fSBarry Smith 
1239f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1240f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1241f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1242f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1243f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1244f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1245f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1246f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1247f1af5d2fSBarry Smith     v += 25;
1248f1af5d2fSBarry Smith 
1249f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1250f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1251f1af5d2fSBarry Smith     while (nz--) {
1252f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1253f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1254f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1255f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1256f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1257f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1258f1af5d2fSBarry Smith       v  += 25;
1259f1af5d2fSBarry Smith     }
1260f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1261f1af5d2fSBarry Smith     idx += 5;
1262f1af5d2fSBarry Smith   }
1263f1af5d2fSBarry Smith   /* backward solve the L^T */
1264f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1265f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1266f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1267f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1268f1af5d2fSBarry Smith     idt  = 5*i;
1269f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1270f1af5d2fSBarry Smith     while (nz--) {
1271f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1272f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1273f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1274f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1275f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1276f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1277f1af5d2fSBarry Smith       v -= 25;
1278f1af5d2fSBarry Smith     }
1279f1af5d2fSBarry Smith   }
1280f1af5d2fSBarry Smith 
1281f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1282f1af5d2fSBarry Smith   ii = 0;
1283f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1284f1af5d2fSBarry Smith     ir      = 5*r[i];
1285f1af5d2fSBarry Smith     x[ir]   = t[ii];
1286f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1287f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1288f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1289f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1290f1af5d2fSBarry Smith     ii += 5;
1291f1af5d2fSBarry Smith   }
1292f1af5d2fSBarry Smith 
1293f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1294f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
12951ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
12961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1297dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1298f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1299f1af5d2fSBarry Smith }
1300f1af5d2fSBarry Smith 
13014a2ae208SSatish Balay #undef __FUNCT__
13024a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
1303dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
1304f1af5d2fSBarry Smith {
1305f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1306f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
13076849ba73SBarry Smith   PetscErrorCode ierr;
13085d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
13095d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1310690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1311f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
131287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
131387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1314f1af5d2fSBarry Smith 
1315f1af5d2fSBarry Smith   PetscFunctionBegin;
13161ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
13171ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1318f1af5d2fSBarry Smith   t  = a->solve_work;
1319f1af5d2fSBarry Smith 
1320f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1321f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1322f1af5d2fSBarry Smith 
1323f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1324f1af5d2fSBarry Smith   ii = 0;
1325f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1326f1af5d2fSBarry Smith     ic      = 6*c[i];
1327f1af5d2fSBarry Smith     t[ii]   = b[ic];
1328f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1329f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1330f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1331f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1332f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1333f1af5d2fSBarry Smith     ii += 6;
1334f1af5d2fSBarry Smith   }
1335f1af5d2fSBarry Smith 
1336f1af5d2fSBarry Smith   /* forward solve the U^T */
1337f1af5d2fSBarry Smith   idx = 0;
1338f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1339f1af5d2fSBarry Smith 
1340f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1341f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1342f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1343f1af5d2fSBarry Smith     x6    = t[5+idx];
1344f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1345f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1346f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1347f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1348f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1349f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1350f1af5d2fSBarry Smith     v += 36;
1351f1af5d2fSBarry Smith 
1352f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1353f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1354f1af5d2fSBarry Smith     while (nz--) {
1355f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1356f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1357f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1358f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1359f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1360f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1361f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1362f1af5d2fSBarry Smith       v  += 36;
1363f1af5d2fSBarry Smith     }
1364f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1365f1af5d2fSBarry Smith     t[5+idx] = s6;
1366f1af5d2fSBarry Smith     idx += 6;
1367f1af5d2fSBarry Smith   }
1368f1af5d2fSBarry Smith   /* backward solve the L^T */
1369f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1370f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1371f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1372f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1373f1af5d2fSBarry Smith     idt  = 6*i;
1374f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1375f1af5d2fSBarry Smith     s6 = t[5+idt];
1376f1af5d2fSBarry Smith     while (nz--) {
1377f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1378f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1379f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1380f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1381f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1382f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1383f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1384f1af5d2fSBarry Smith       v -= 36;
1385f1af5d2fSBarry Smith     }
1386f1af5d2fSBarry Smith   }
1387f1af5d2fSBarry Smith 
1388f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1389f1af5d2fSBarry Smith   ii = 0;
1390f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1391f1af5d2fSBarry Smith     ir      = 6*r[i];
1392f1af5d2fSBarry Smith     x[ir]   = t[ii];
1393f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1394f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1395f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1396f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1397f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1398f1af5d2fSBarry Smith     ii += 6;
1399f1af5d2fSBarry Smith   }
1400f1af5d2fSBarry Smith 
1401f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1402f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
14031ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
14041ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1405dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1406f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1407f1af5d2fSBarry Smith }
1408f1af5d2fSBarry Smith 
14094a2ae208SSatish Balay #undef __FUNCT__
14104a2ae208SSatish Balay #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
1411dfbe8321SBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
1412f1af5d2fSBarry Smith {
1413f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1414f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
14156849ba73SBarry Smith   PetscErrorCode ierr;
14165d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
14175d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1418690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1419f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
142087828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
142187828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1422f1af5d2fSBarry Smith 
1423f1af5d2fSBarry Smith   PetscFunctionBegin;
14241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
14251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1426f1af5d2fSBarry Smith   t  = a->solve_work;
1427f1af5d2fSBarry Smith 
1428f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1429f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1430f1af5d2fSBarry Smith 
1431f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1432f1af5d2fSBarry Smith   ii = 0;
1433f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1434f1af5d2fSBarry Smith     ic      = 7*c[i];
1435f1af5d2fSBarry Smith     t[ii]   = b[ic];
1436f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1437f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1438f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1439f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1440f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1441f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1442f1af5d2fSBarry Smith     ii += 7;
1443f1af5d2fSBarry Smith   }
1444f1af5d2fSBarry Smith 
1445f1af5d2fSBarry Smith   /* forward solve the U^T */
1446f1af5d2fSBarry Smith   idx = 0;
1447f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1448f1af5d2fSBarry Smith 
1449f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1450f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1451f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1452f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1453f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1454f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1455f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1456f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1457f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1458f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1459f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1460f1af5d2fSBarry Smith     v += 49;
1461f1af5d2fSBarry Smith 
1462f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1463f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1464f1af5d2fSBarry Smith     while (nz--) {
1465f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1466f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1467f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1468f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1469f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1470f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1471f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1472f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1473f1af5d2fSBarry Smith       v  += 49;
1474f1af5d2fSBarry Smith     }
1475f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1476f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1477f1af5d2fSBarry Smith     idx += 7;
1478f1af5d2fSBarry Smith   }
1479f1af5d2fSBarry Smith   /* backward solve the L^T */
1480f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1481f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1482f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1483f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1484f1af5d2fSBarry Smith     idt  = 7*i;
1485f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1486f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1487f1af5d2fSBarry Smith     while (nz--) {
1488f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1489f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1490f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1491f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1492f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1493f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1494f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1495f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1496f1af5d2fSBarry Smith       v -= 49;
1497f1af5d2fSBarry Smith     }
1498f1af5d2fSBarry Smith   }
1499f1af5d2fSBarry Smith 
1500f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1501f1af5d2fSBarry Smith   ii = 0;
1502f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1503f1af5d2fSBarry Smith     ir      = 7*r[i];
1504f1af5d2fSBarry Smith     x[ir]   = t[ii];
1505f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1506f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1507f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1508f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1509f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1510f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1511f1af5d2fSBarry Smith     ii += 7;
1512f1af5d2fSBarry Smith   }
1513f1af5d2fSBarry Smith 
1514f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1515f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15161ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
15171ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1518dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1519f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1520f1af5d2fSBarry Smith }
1521f1af5d2fSBarry Smith 
15224e2b4712SSatish Balay /* ----------------------------------------------------------- */
15234a2ae208SSatish Balay #undef __FUNCT__
15244a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_N"
1525dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
15264e2b4712SSatish Balay {
15274e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
15284e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
15296849ba73SBarry Smith   PetscErrorCode ierr;
15305d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
15315d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
15325d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
15333f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
153487828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
15354e2b4712SSatish Balay 
15364e2b4712SSatish Balay   PetscFunctionBegin;
15371ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
15381ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1539f1af5d2fSBarry Smith   t  = a->solve_work;
15404e2b4712SSatish Balay 
15414e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
15424e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
15434e2b4712SSatish Balay 
15444e2b4712SSatish Balay   /* forward solve the lower triangular */
154587828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
15464e2b4712SSatish Balay   for (i=1; i<n; i++) {
15474e2b4712SSatish Balay     v   = aa + bs2*ai[i];
15484e2b4712SSatish Balay     vi  = aj + ai[i];
15494e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
1550f1af5d2fSBarry Smith     s = t + bs*i;
155187828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
15524e2b4712SSatish Balay     while (nz--) {
1553f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
15544e2b4712SSatish Balay       v += bs2;
15554e2b4712SSatish Balay     }
15564e2b4712SSatish Balay   }
15574e2b4712SSatish Balay   /* backward solve the upper triangular */
1558d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
15594e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
15604e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
15614e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
15624e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
156387828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
15644e2b4712SSatish Balay     while (nz--) {
1565f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
15664e2b4712SSatish Balay       v += bs2;
15674e2b4712SSatish Balay     }
1568f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
156987828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
15704e2b4712SSatish Balay   }
15714e2b4712SSatish Balay 
15724e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
15734e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15741ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
15751ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1576dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
15774e2b4712SSatish Balay   PetscFunctionReturn(0);
15784e2b4712SSatish Balay }
15794e2b4712SSatish Balay 
15805c42ef9dSBarry Smith /* ----------------------------------------------------------- */
15815c42ef9dSBarry Smith #undef __FUNCT__
15825c42ef9dSBarry Smith #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
15835c42ef9dSBarry Smith PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
15845c42ef9dSBarry Smith {
15855c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
15865c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
15875c42ef9dSBarry Smith   PetscErrorCode    ierr;
15885c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
15895c42ef9dSBarry Smith   PetscInt          i,n=a->mbs,j;
15905c42ef9dSBarry Smith   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
15915c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
15925c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
15935c42ef9dSBarry Smith   const PetscScalar *b;
15945c42ef9dSBarry Smith   PetscFunctionBegin;
15955c42ef9dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15965c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
15975c42ef9dSBarry Smith   t    = a->solve_work;
15985c42ef9dSBarry Smith 
15995c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
16005c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
16015c42ef9dSBarry Smith 
16025c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
16035c42ef9dSBarry Smith   for (i=0; i<n; i++) {
16045c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
16055c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
16065c42ef9dSBarry Smith     }
16075c42ef9dSBarry Smith   }
16085c42ef9dSBarry Smith 
16095c42ef9dSBarry Smith 
16105c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
16115c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
16125c42ef9dSBarry Smith   for (i=0; i<n; i++){
16135c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
16145c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
16155c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
16165c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
16175c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
16185c42ef9dSBarry Smith     while (nz--) {
16195c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
16205c42ef9dSBarry Smith       v += bs2;
16215c42ef9dSBarry Smith     }
16225c42ef9dSBarry Smith   }
16235c42ef9dSBarry Smith 
16245c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
16255c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
16265c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
16275c42ef9dSBarry Smith     vi  = aj + ai[i];
16285c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
16295c42ef9dSBarry Smith     while (nz--) {
16305c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
16315c42ef9dSBarry Smith       v += bs2;
16325c42ef9dSBarry Smith     }
16335c42ef9dSBarry Smith   }
16345c42ef9dSBarry Smith 
16355c42ef9dSBarry Smith   /* copy t into x according to permutation */
16365c42ef9dSBarry Smith   for (i=0; i<n; i++) {
16375c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
16385c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
16395c42ef9dSBarry Smith     }
16405c42ef9dSBarry Smith   }
16415c42ef9dSBarry Smith 
16425c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
16435c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
16445c42ef9dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16455c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
16465c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
16475c42ef9dSBarry Smith   PetscFunctionReturn(0);
16485c42ef9dSBarry Smith }
16495c42ef9dSBarry Smith 
16504a2ae208SSatish Balay #undef __FUNCT__
1651*8499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct"
1652*8499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
1653*8499736aSShri Abhyankar {
1654*8499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1655*8499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1656*8499736aSShri Abhyankar   PetscErrorCode    ierr;
1657*8499736aSShri Abhyankar   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
1658*8499736aSShri Abhyankar   PetscInt          i,n=a->mbs,j;
1659*8499736aSShri Abhyankar   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
1660*8499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
1661*8499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
1662*8499736aSShri Abhyankar   const PetscScalar *b;
1663*8499736aSShri Abhyankar   PetscFunctionBegin;
1664*8499736aSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1665*8499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1666*8499736aSShri Abhyankar   t    = a->solve_work;
1667*8499736aSShri Abhyankar 
1668*8499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1669*8499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1670*8499736aSShri Abhyankar 
1671*8499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
1672*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
1673*8499736aSShri Abhyankar     for (j=0; j<bs; j++) {
1674*8499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
1675*8499736aSShri Abhyankar     }
1676*8499736aSShri Abhyankar   }
1677*8499736aSShri Abhyankar 
1678*8499736aSShri Abhyankar 
1679*8499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
1680*8499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
1681*8499736aSShri Abhyankar   for (i=0; i<n; i++){
1682*8499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
1683*8499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
1684*8499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
1685*8499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
1686*8499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
1687*8499736aSShri Abhyankar     for(j=0;j>-nz;j--){
1688*8499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1689*8499736aSShri Abhyankar       v -= bs2;
1690*8499736aSShri Abhyankar     }
1691*8499736aSShri Abhyankar   }
1692*8499736aSShri Abhyankar 
1693*8499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
1694*8499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
1695*8499736aSShri Abhyankar     v   = aa + bs2*ai[i];
1696*8499736aSShri Abhyankar     vi  = aj + ai[i];
1697*8499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
1698*8499736aSShri Abhyankar     for(j=0;j<nz;j++){
1699*8499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
1700*8499736aSShri Abhyankar       v += bs2;
1701*8499736aSShri Abhyankar     }
1702*8499736aSShri Abhyankar   }
1703*8499736aSShri Abhyankar 
1704*8499736aSShri Abhyankar   /* copy t into x according to permutation */
1705*8499736aSShri Abhyankar   for (i=0; i<n; i++) {
1706*8499736aSShri Abhyankar     for (j=0; j<bs; j++) {
1707*8499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
1708*8499736aSShri Abhyankar     }
1709*8499736aSShri Abhyankar   }
1710*8499736aSShri Abhyankar 
1711*8499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1712*8499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1713*8499736aSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1714*8499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1715*8499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
1716*8499736aSShri Abhyankar   PetscFunctionReturn(0);
1717*8499736aSShri Abhyankar }
1718*8499736aSShri Abhyankar 
1719*8499736aSShri Abhyankar #undef __FUNCT__
17204a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7"
1721dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
17224e2b4712SSatish Balay {
17234e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
17244e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
17256849ba73SBarry Smith   PetscErrorCode ierr;
17265d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
17275d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
17283f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
172987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
173087828ca2SBarry Smith   PetscScalar    *x,*b,*t;
17314e2b4712SSatish Balay 
17324e2b4712SSatish Balay   PetscFunctionBegin;
17331ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
17341ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1735f1af5d2fSBarry Smith   t  = a->solve_work;
17364e2b4712SSatish Balay 
17374e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
17384e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
17394e2b4712SSatish Balay 
17404e2b4712SSatish Balay   /* forward solve the lower triangular */
17414e2b4712SSatish Balay   idx    = 7*(*r++);
1742f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
1743f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
1744f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
17454e2b4712SSatish Balay 
17464e2b4712SSatish Balay   for (i=1; i<n; i++) {
17474e2b4712SSatish Balay     v     = aa + 49*ai[i];
17484e2b4712SSatish Balay     vi    = aj + ai[i];
17494e2b4712SSatish Balay     nz    = diag[i] - ai[i];
17504e2b4712SSatish Balay     idx   = 7*(*r++);
1751f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
1752f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
17534e2b4712SSatish Balay     while (nz--) {
17544e2b4712SSatish Balay       idx   = 7*(*vi++);
1755f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
1756f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
1757f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
1758f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1759f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1760f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1761f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1762f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1763f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1764f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
17654e2b4712SSatish Balay       v += 49;
17664e2b4712SSatish Balay     }
17674e2b4712SSatish Balay     idx = 7*i;
1768f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
1769f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1770f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
17714e2b4712SSatish Balay   }
17724e2b4712SSatish Balay   /* backward solve the upper triangular */
17734e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
17744e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
17754e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
17764e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
17774e2b4712SSatish Balay     idt  = 7*i;
1778f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
1779f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1780f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
17814e2b4712SSatish Balay     while (nz--) {
17824e2b4712SSatish Balay       idx   = 7*(*vi++);
1783f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
1784f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1785f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
1786f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1787f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1788f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1789f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1790f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1791f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1792f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
17934e2b4712SSatish Balay       v += 49;
17944e2b4712SSatish Balay     }
17954e2b4712SSatish Balay     idc = 7*(*c--);
17964e2b4712SSatish Balay     v   = aa + 49*diag[i];
1797f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
1798f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
1799f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
1800f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
1801f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
1802f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
1803f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
1804f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
1805f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
1806f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
1807f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
1808f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
1809f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
1810f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
18114e2b4712SSatish Balay   }
18124e2b4712SSatish Balay 
18134e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
18144e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
18151ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
18161ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1817dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
18184e2b4712SSatish Balay   PetscFunctionReturn(0);
18194e2b4712SSatish Balay }
18204e2b4712SSatish Balay 
18218f690400SShri Abhyankar #undef __FUNCT__
1822a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
1823a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
182435aa4fcfSShri Abhyankar {
182535aa4fcfSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
182635aa4fcfSShri Abhyankar   IS             iscol=a->col,isrow=a->row;
182735aa4fcfSShri Abhyankar   PetscErrorCode ierr;
182835aa4fcfSShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
182935aa4fcfSShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
183035aa4fcfSShri Abhyankar   MatScalar      *aa=a->a,*v;
183135aa4fcfSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
183235aa4fcfSShri Abhyankar   PetscScalar    *x,*b,*t;
183335aa4fcfSShri Abhyankar 
183435aa4fcfSShri Abhyankar   PetscFunctionBegin;
183535aa4fcfSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
183635aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
183735aa4fcfSShri Abhyankar   t  = a->solve_work;
183835aa4fcfSShri Abhyankar 
183935aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
184035aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
184135aa4fcfSShri Abhyankar 
184235aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
184335aa4fcfSShri Abhyankar   idx    = 7*r[0];
184435aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
184535aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
184635aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
184735aa4fcfSShri Abhyankar 
184835aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
184935aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
185035aa4fcfSShri Abhyankar     vi    = aj + ai[i];
185135aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
185235aa4fcfSShri Abhyankar     idx   = 7*r[i];
185335aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
185435aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
185535aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
185635aa4fcfSShri Abhyankar       idx   = 7*vi[m];
185735aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
185835aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
185935aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
186035aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
186135aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
186235aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
186335aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
186435aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
186535aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
186635aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
186735aa4fcfSShri Abhyankar       v += 49;
186835aa4fcfSShri Abhyankar     }
186935aa4fcfSShri Abhyankar     idx = 7*i;
187035aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
187135aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
187235aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
187335aa4fcfSShri Abhyankar   }
187435aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
187535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
187635aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
187735aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
187835aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
187935aa4fcfSShri Abhyankar     idt  = 7*i;
188035aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
188135aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
188235aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
188335aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
188435aa4fcfSShri Abhyankar       idx   = 7*vi[m];
188535aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
188635aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
188735aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
188835aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
188935aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
189035aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
189135aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
189235aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
189335aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
189435aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
189535aa4fcfSShri Abhyankar       v += 49;
189635aa4fcfSShri Abhyankar     }
189735aa4fcfSShri Abhyankar     idc = 7*c[i];
189835aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
189935aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
190035aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
190135aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
190235aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
190335aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
190435aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
190535aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
190635aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
190735aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
190835aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
190935aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
191035aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
191135aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
191235aa4fcfSShri Abhyankar   }
191335aa4fcfSShri Abhyankar 
191435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
191535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
191635aa4fcfSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
191735aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
191835aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
191935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
192035aa4fcfSShri Abhyankar }
192135aa4fcfSShri Abhyankar 
192235aa4fcfSShri Abhyankar #undef __FUNCT__
19234a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
1924dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
192515091d37SBarry Smith {
192615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
1927690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1928dfbe8321SBarry Smith   PetscErrorCode    ierr;
1929690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
1930d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
1931d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
1932d9fead3dSBarry Smith   const PetscScalar *b;
193315091d37SBarry Smith 
193415091d37SBarry Smith   PetscFunctionBegin;
1935d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19361ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
193715091d37SBarry Smith   /* forward solve the lower triangular */
193815091d37SBarry Smith   idx    = 0;
193915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
194015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
194115091d37SBarry Smith   x[6] = b[6+idx];
194215091d37SBarry Smith   for (i=1; i<n; i++) {
194315091d37SBarry Smith     v     =  aa + 49*ai[i];
194415091d37SBarry Smith     vi    =  aj + ai[i];
194515091d37SBarry Smith     nz    =  diag[i] - ai[i];
194615091d37SBarry Smith     idx   =  7*i;
1947f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
1948f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
1949f1af5d2fSBarry Smith     s7  =  b[6+idx];
195015091d37SBarry Smith     while (nz--) {
195115091d37SBarry Smith       jdx   = 7*(*vi++);
195215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
195315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
195415091d37SBarry Smith       x7    = x[6+jdx];
1955f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1956f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1957f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1958f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1959f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1960f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1961f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
196215091d37SBarry Smith       v += 49;
196315091d37SBarry Smith      }
1964f1af5d2fSBarry Smith     x[idx]   = s1;
1965f1af5d2fSBarry Smith     x[1+idx] = s2;
1966f1af5d2fSBarry Smith     x[2+idx] = s3;
1967f1af5d2fSBarry Smith     x[3+idx] = s4;
1968f1af5d2fSBarry Smith     x[4+idx] = s5;
1969f1af5d2fSBarry Smith     x[5+idx] = s6;
1970f1af5d2fSBarry Smith     x[6+idx] = s7;
197115091d37SBarry Smith   }
197215091d37SBarry Smith   /* backward solve the upper triangular */
197315091d37SBarry Smith   for (i=n-1; i>=0; i--){
197415091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
197515091d37SBarry Smith     vi   = aj + diag[i] + 1;
197615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
197715091d37SBarry Smith     idt  = 7*i;
1978f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
1979f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
1980f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
1981f1af5d2fSBarry Smith     s7 = x[6+idt];
198215091d37SBarry Smith     while (nz--) {
198315091d37SBarry Smith       idx   = 7*(*vi++);
198415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
198515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
198615091d37SBarry Smith       x7    = x[6+idx];
1987f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1988f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1989f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1990f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1991f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1992f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1993f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
199415091d37SBarry Smith       v += 49;
199515091d37SBarry Smith     }
199615091d37SBarry Smith     v        = aa + 49*diag[i];
1997f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
1998f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
1999f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2000f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2001f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2002f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2003f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2004f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2005f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2006f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2007f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2008f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2009f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2010f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
201115091d37SBarry Smith   }
201215091d37SBarry Smith 
2013d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2015dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
201615091d37SBarry Smith   PetscFunctionReturn(0);
201715091d37SBarry Smith }
201815091d37SBarry Smith 
2019cee9d6f2SShri Abhyankar #undef __FUNCT__
2020a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
2021a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
202253cca76cSShri Abhyankar {
202353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
202453cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
202553cca76cSShri Abhyankar     PetscErrorCode    ierr;
202653cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
202753cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
202853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
202953cca76cSShri Abhyankar     PetscScalar       *x;
203053cca76cSShri Abhyankar     const PetscScalar *b;
203153cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
203253cca76cSShri Abhyankar 
203353cca76cSShri Abhyankar     PetscFunctionBegin;
203453cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
203553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
203653cca76cSShri Abhyankar     /* forward solve the lower triangular */
203753cca76cSShri Abhyankar     idx    = 0;
203853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
203953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
204053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
204153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
204253cca76cSShri Abhyankar        vi   = aj + ai[i];
204353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
204453cca76cSShri Abhyankar       idx   = bs*i;
204553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
204653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
204753cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
204853cca76cSShri Abhyankar           jdx   = bs*vi[k];
204953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
205053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
205153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
205253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
205353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
205453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
205553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
205653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
205753cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
205853cca76cSShri Abhyankar           v   +=  bs2;
205953cca76cSShri Abhyankar         }
206053cca76cSShri Abhyankar 
206153cca76cSShri Abhyankar        x[idx]   = s1;
206253cca76cSShri Abhyankar        x[1+idx] = s2;
206353cca76cSShri Abhyankar        x[2+idx] = s3;
206453cca76cSShri Abhyankar        x[3+idx] = s4;
206553cca76cSShri Abhyankar        x[4+idx] = s5;
206653cca76cSShri Abhyankar        x[5+idx] = s6;
206753cca76cSShri Abhyankar        x[6+idx] = s7;
206853cca76cSShri Abhyankar     }
206953cca76cSShri Abhyankar 
207053cca76cSShri Abhyankar    /* backward solve the upper triangular */
207153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
207253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
207353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
207453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
207553cca76cSShri Abhyankar      idt = bs*i;
207653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
207753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
207853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
207953cca76cSShri Abhyankar       idx   = bs*vi[k];
208053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
208153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
208253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
208353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
208453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
208553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
208653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
208753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
208853cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
208953cca76cSShri Abhyankar         v   +=  bs2;
209053cca76cSShri Abhyankar     }
209153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
209253cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
209353cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
209453cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
209553cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
209653cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
209753cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
209853cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
209953cca76cSShri Abhyankar   }
210053cca76cSShri Abhyankar 
210153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
210253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
210353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
210453cca76cSShri Abhyankar   PetscFunctionReturn(0);
210553cca76cSShri Abhyankar }
210653cca76cSShri Abhyankar 
210753cca76cSShri Abhyankar #undef __FUNCT__
21084a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6"
2109dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
211015091d37SBarry Smith {
211115091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
211215091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
21136849ba73SBarry Smith   PetscErrorCode    ierr;
21145d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
21155d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2116d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2117d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2118d9fead3dSBarry Smith   const PetscScalar *b;
211915091d37SBarry Smith   PetscFunctionBegin;
2120d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21211ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2122f1af5d2fSBarry Smith   t  = a->solve_work;
212315091d37SBarry Smith 
212415091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
212515091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
212615091d37SBarry Smith 
212715091d37SBarry Smith   /* forward solve the lower triangular */
212815091d37SBarry Smith   idx    = 6*(*r++);
2129f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2130f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2131f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
213215091d37SBarry Smith   for (i=1; i<n; i++) {
213315091d37SBarry Smith     v     = aa + 36*ai[i];
213415091d37SBarry Smith     vi    = aj + ai[i];
213515091d37SBarry Smith     nz    = diag[i] - ai[i];
213615091d37SBarry Smith     idx   = 6*(*r++);
2137f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2138f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
213915091d37SBarry Smith     while (nz--) {
214015091d37SBarry Smith       idx   = 6*(*vi++);
2141f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2142f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2143f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2144f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2145f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2146f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2147f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2148f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
214915091d37SBarry Smith       v += 36;
215015091d37SBarry Smith     }
215115091d37SBarry Smith     idx = 6*i;
2152f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2153f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2154f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
215515091d37SBarry Smith   }
215615091d37SBarry Smith   /* backward solve the upper triangular */
215715091d37SBarry Smith   for (i=n-1; i>=0; i--){
215815091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
215915091d37SBarry Smith     vi   = aj + diag[i] + 1;
216015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
216115091d37SBarry Smith     idt  = 6*i;
2162f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2163f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
2164f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
216515091d37SBarry Smith     while (nz--) {
216615091d37SBarry Smith       idx   = 6*(*vi++);
2167f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2168f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2169f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
2170f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2171f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2172f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2173f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2174f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2175f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
217615091d37SBarry Smith       v += 36;
217715091d37SBarry Smith     }
217815091d37SBarry Smith     idc = 6*(*c--);
217915091d37SBarry Smith     v   = aa + 36*diag[i];
2180f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2181f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
2182f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2183f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
2184f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2185f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
2186f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2187f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
2188f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2189f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
2190f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2191f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
219215091d37SBarry Smith   }
219315091d37SBarry Smith 
219415091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
219515091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2196d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2198dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
219915091d37SBarry Smith   PetscFunctionReturn(0);
220015091d37SBarry Smith }
220115091d37SBarry Smith 
22026506fda5SShri Abhyankar #undef __FUNCT__
2203a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
2204a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
22056506fda5SShri Abhyankar {
22066506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
22076506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
22086506fda5SShri Abhyankar   PetscErrorCode    ierr;
22096506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
22106506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
22116506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
22126506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
22136506fda5SShri Abhyankar   const PetscScalar *b;
22146506fda5SShri Abhyankar   PetscFunctionBegin;
22156506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22166506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22176506fda5SShri Abhyankar   t  = a->solve_work;
22186506fda5SShri Abhyankar 
22196506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22206506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22216506fda5SShri Abhyankar 
22226506fda5SShri Abhyankar   /* forward solve the lower triangular */
22236506fda5SShri Abhyankar   idx    = 6*r[0];
22246506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
22256506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
22266506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
22276506fda5SShri Abhyankar   for (i=1; i<n; i++) {
22286506fda5SShri Abhyankar     v     = aa + 36*ai[i];
22296506fda5SShri Abhyankar     vi    = aj + ai[i];
22306506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
22316506fda5SShri Abhyankar     idx   = 6*r[i];
22326506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
22336506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
22346506fda5SShri Abhyankar     for(m=0;m<nz;m++){
22356506fda5SShri Abhyankar       idx   = 6*vi[m];
22366506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
22376506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
22386506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
22396506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
22406506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
22416506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
22426506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
22436506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
22446506fda5SShri Abhyankar       v += 36;
22456506fda5SShri Abhyankar     }
22466506fda5SShri Abhyankar     idx = 6*i;
22476506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
22486506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
22496506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
22506506fda5SShri Abhyankar   }
22516506fda5SShri Abhyankar   /* backward solve the upper triangular */
22526506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
22536506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
22546506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
22556506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
22566506fda5SShri Abhyankar     idt  = 6*i;
22576506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
22586506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
22596506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
22606506fda5SShri Abhyankar     for(m=0;m<nz;m++){
22616506fda5SShri Abhyankar       idx   = 6*vi[m];
22626506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
22636506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
22646506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
22656506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
22666506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
22676506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
22686506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
22696506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
22706506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
22716506fda5SShri Abhyankar       v += 36;
22726506fda5SShri Abhyankar     }
22736506fda5SShri Abhyankar     idc = 6*c[i];
22746506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
22756506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
22766506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
22776506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
22786506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
22796506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
22806506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
22816506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
22826506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
22836506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
22846506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
22856506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
22866506fda5SShri Abhyankar   }
22876506fda5SShri Abhyankar 
22886506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22896506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22906506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22916506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22926506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
22936506fda5SShri Abhyankar   PetscFunctionReturn(0);
22946506fda5SShri Abhyankar }
22958f690400SShri Abhyankar 
22968f690400SShri Abhyankar #undef __FUNCT__
22974a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
2298dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
229915091d37SBarry Smith {
230015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2301690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2302dfbe8321SBarry Smith   PetscErrorCode    ierr;
2303690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2304d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2305d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2306d9fead3dSBarry Smith   const PetscScalar *b;
230715091d37SBarry Smith 
230815091d37SBarry Smith   PetscFunctionBegin;
2309d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23101ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
231115091d37SBarry Smith   /* forward solve the lower triangular */
231215091d37SBarry Smith   idx    = 0;
231315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
231415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
231515091d37SBarry Smith   for (i=1; i<n; i++) {
231615091d37SBarry Smith     v     =  aa + 36*ai[i];
231715091d37SBarry Smith     vi    =  aj + ai[i];
231815091d37SBarry Smith     nz    =  diag[i] - ai[i];
231915091d37SBarry Smith     idx   =  6*i;
2320f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2321f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
232215091d37SBarry Smith     while (nz--) {
232315091d37SBarry Smith       jdx   = 6*(*vi++);
232415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
232515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2326f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2327f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2328f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2329f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2330f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2331f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
233215091d37SBarry Smith       v += 36;
233315091d37SBarry Smith      }
2334f1af5d2fSBarry Smith     x[idx]   = s1;
2335f1af5d2fSBarry Smith     x[1+idx] = s2;
2336f1af5d2fSBarry Smith     x[2+idx] = s3;
2337f1af5d2fSBarry Smith     x[3+idx] = s4;
2338f1af5d2fSBarry Smith     x[4+idx] = s5;
2339f1af5d2fSBarry Smith     x[5+idx] = s6;
234015091d37SBarry Smith   }
234115091d37SBarry Smith   /* backward solve the upper triangular */
234215091d37SBarry Smith   for (i=n-1; i>=0; i--){
234315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
234415091d37SBarry Smith     vi   = aj + diag[i] + 1;
234515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
234615091d37SBarry Smith     idt  = 6*i;
2347f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2348f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2349f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
235015091d37SBarry Smith     while (nz--) {
235115091d37SBarry Smith       idx   = 6*(*vi++);
235215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
235315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2354f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2355f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2356f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2357f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2358f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2359f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
236015091d37SBarry Smith       v += 36;
236115091d37SBarry Smith     }
236215091d37SBarry Smith     v        = aa + 36*diag[i];
2363f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2364f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2365f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2366f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2367f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2368f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
236915091d37SBarry Smith   }
237015091d37SBarry Smith 
2371d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23721ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2373dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
237415091d37SBarry Smith   PetscFunctionReturn(0);
237515091d37SBarry Smith }
237615091d37SBarry Smith 
2377cee9d6f2SShri Abhyankar #undef __FUNCT__
2378a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2379a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
238053cca76cSShri Abhyankar {
238153cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
238253cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
238353cca76cSShri Abhyankar     PetscErrorCode    ierr;
238453cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
238553cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
238653cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
238753cca76cSShri Abhyankar     PetscScalar       *x;
238853cca76cSShri Abhyankar     const PetscScalar *b;
238953cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
239053cca76cSShri Abhyankar 
239153cca76cSShri Abhyankar     PetscFunctionBegin;
239253cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
239353cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
239453cca76cSShri Abhyankar     /* forward solve the lower triangular */
239553cca76cSShri Abhyankar     idx    = 0;
239653cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
239753cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
239853cca76cSShri Abhyankar     for (i=1; i<n; i++) {
239953cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
240053cca76cSShri Abhyankar        vi   = aj + ai[i];
240153cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
240253cca76cSShri Abhyankar       idx   = bs*i;
240353cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
240453cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
240553cca76cSShri Abhyankar        for(k=0;k<nz;k++){
240653cca76cSShri Abhyankar           jdx   = bs*vi[k];
240753cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
240853cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
240953cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
241053cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
241153cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
241253cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
241353cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
241453cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
241553cca76cSShri Abhyankar           v   +=  bs2;
241653cca76cSShri Abhyankar         }
241753cca76cSShri Abhyankar 
241853cca76cSShri Abhyankar        x[idx]   = s1;
241953cca76cSShri Abhyankar        x[1+idx] = s2;
242053cca76cSShri Abhyankar        x[2+idx] = s3;
242153cca76cSShri Abhyankar        x[3+idx] = s4;
242253cca76cSShri Abhyankar        x[4+idx] = s5;
242353cca76cSShri Abhyankar        x[5+idx] = s6;
242453cca76cSShri Abhyankar     }
242553cca76cSShri Abhyankar 
242653cca76cSShri Abhyankar    /* backward solve the upper triangular */
242753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
242853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
242953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
243053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
243153cca76cSShri Abhyankar      idt = bs*i;
243253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
243353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
243453cca76cSShri Abhyankar      for(k=0;k<nz;k++){
243553cca76cSShri Abhyankar       idx   = bs*vi[k];
243653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
243753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
243853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
243953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
244053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
244153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
244253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
244353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
244453cca76cSShri Abhyankar         v   +=  bs2;
244553cca76cSShri Abhyankar     }
244653cca76cSShri Abhyankar     /* x = inv_diagonal*x */
244753cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
244853cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
244953cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
245053cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
245153cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
245253cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
245353cca76cSShri Abhyankar   }
245453cca76cSShri Abhyankar 
245553cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
245653cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
245753cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
245853cca76cSShri Abhyankar   PetscFunctionReturn(0);
245953cca76cSShri Abhyankar }
246053cca76cSShri Abhyankar 
246153cca76cSShri Abhyankar #undef __FUNCT__
24624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5"
2463dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
24644e2b4712SSatish Balay {
24654e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
24664e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
24676849ba73SBarry Smith   PetscErrorCode    ierr;
24685d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
24695d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2470d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2471d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
2472d9fead3dSBarry Smith   const PetscScalar *b;
24734e2b4712SSatish Balay 
24744e2b4712SSatish Balay   PetscFunctionBegin;
2475d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2477f1af5d2fSBarry Smith   t  = a->solve_work;
24784e2b4712SSatish Balay 
24794e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
24804e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
24814e2b4712SSatish Balay 
24824e2b4712SSatish Balay   /* forward solve the lower triangular */
24834e2b4712SSatish Balay   idx    = 5*(*r++);
2484f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2485f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
24864e2b4712SSatish Balay   for (i=1; i<n; i++) {
24874e2b4712SSatish Balay     v     = aa + 25*ai[i];
24884e2b4712SSatish Balay     vi    = aj + ai[i];
24894e2b4712SSatish Balay     nz    = diag[i] - ai[i];
24904e2b4712SSatish Balay     idx   = 5*(*r++);
2491f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2492f1af5d2fSBarry Smith     s5  = b[4+idx];
24934e2b4712SSatish Balay     while (nz--) {
24944e2b4712SSatish Balay       idx   = 5*(*vi++);
2495f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2496f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2497f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2498f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2499f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2500f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2501f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
25024e2b4712SSatish Balay       v += 25;
25034e2b4712SSatish Balay     }
25044e2b4712SSatish Balay     idx = 5*i;
2505f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2506f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
25074e2b4712SSatish Balay   }
25084e2b4712SSatish Balay   /* backward solve the upper triangular */
25094e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
25104e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
25114e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
25124e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
25134e2b4712SSatish Balay     idt  = 5*i;
2514f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2515f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
25164e2b4712SSatish Balay     while (nz--) {
25174e2b4712SSatish Balay       idx   = 5*(*vi++);
2518f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2519f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2520f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
2521f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
2522f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
2523f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
2524f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
25254e2b4712SSatish Balay       v += 25;
25264e2b4712SSatish Balay     }
25274e2b4712SSatish Balay     idc = 5*(*c--);
25284e2b4712SSatish Balay     v   = aa + 25*diag[i];
2529f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
2530f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
2531f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
2532f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
2533f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
2534f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
2535f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
2536f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
2537f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
2538f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
25394e2b4712SSatish Balay   }
25404e2b4712SSatish Balay 
25414e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25424e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2543d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2545dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
25464e2b4712SSatish Balay   PetscFunctionReturn(0);
25474e2b4712SSatish Balay }
25484e2b4712SSatish Balay 
254978bb4007SShri Abhyankar #undef __FUNCT__
2550a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
2551a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
255278bb4007SShri Abhyankar {
255378bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
255478bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
255578bb4007SShri Abhyankar   PetscErrorCode    ierr;
255678bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
255778bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
255878bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
255978bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
256078bb4007SShri Abhyankar   const PetscScalar *b;
256178bb4007SShri Abhyankar 
256278bb4007SShri Abhyankar   PetscFunctionBegin;
256378bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
256478bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256578bb4007SShri Abhyankar   t  = a->solve_work;
256678bb4007SShri Abhyankar 
256778bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
256878bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
256978bb4007SShri Abhyankar 
257078bb4007SShri Abhyankar   /* forward solve the lower triangular */
257178bb4007SShri Abhyankar   idx    = 5*r[0];
257278bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
257378bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
257478bb4007SShri Abhyankar   for (i=1; i<n; i++) {
257578bb4007SShri Abhyankar     v     = aa + 25*ai[i];
257678bb4007SShri Abhyankar     vi    = aj + ai[i];
257778bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
257878bb4007SShri Abhyankar     idx   = 5*r[i];
257978bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
258078bb4007SShri Abhyankar     s5  = b[4+idx];
258178bb4007SShri Abhyankar     for(m=0;m<nz;m++){
258278bb4007SShri Abhyankar       idx   = 5*vi[m];
258378bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
258478bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
258578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
258678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
258778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
258878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
258978bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
259078bb4007SShri Abhyankar       v += 25;
259178bb4007SShri Abhyankar     }
259278bb4007SShri Abhyankar     idx = 5*i;
259378bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
259478bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
259578bb4007SShri Abhyankar   }
259678bb4007SShri Abhyankar   /* backward solve the upper triangular */
259778bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
259878bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
259978bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
260078bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
260178bb4007SShri Abhyankar     idt  = 5*i;
260278bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
260378bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
260478bb4007SShri Abhyankar     for(m=0;m<nz;m++){
260578bb4007SShri Abhyankar       idx   = 5*vi[m];
260678bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
260778bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
260878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
260978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
261078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
261178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
261278bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
261378bb4007SShri Abhyankar       v += 25;
261478bb4007SShri Abhyankar     }
261578bb4007SShri Abhyankar     idc = 5*c[i];
261678bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
261778bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
261878bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
261978bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
262078bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
262178bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
262278bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
262378bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
262478bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
262578bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
262678bb4007SShri Abhyankar   }
262778bb4007SShri Abhyankar 
262878bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
262978bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
263078bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
263178bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
263278bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
263378bb4007SShri Abhyankar   PetscFunctionReturn(0);
263478bb4007SShri Abhyankar }
263578bb4007SShri Abhyankar 
26368f690400SShri Abhyankar #undef __FUNCT__
26374a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
2638dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
263915091d37SBarry Smith {
264015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2641690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2642dfbe8321SBarry Smith   PetscErrorCode    ierr;
2643690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2644d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2645d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
2646d9fead3dSBarry Smith   const PetscScalar *b;
264715091d37SBarry Smith 
264815091d37SBarry Smith   PetscFunctionBegin;
2649d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26501ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
265115091d37SBarry Smith   /* forward solve the lower triangular */
265215091d37SBarry Smith   idx    = 0;
265315091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
265415091d37SBarry Smith   for (i=1; i<n; i++) {
265515091d37SBarry Smith     v     =  aa + 25*ai[i];
265615091d37SBarry Smith     vi    =  aj + ai[i];
265715091d37SBarry Smith     nz    =  diag[i] - ai[i];
265815091d37SBarry Smith     idx   =  5*i;
2659f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
266015091d37SBarry Smith     while (nz--) {
266115091d37SBarry Smith       jdx   = 5*(*vi++);
266215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
2663f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2664f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2665f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2666f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2667f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
266815091d37SBarry Smith       v    += 25;
266915091d37SBarry Smith     }
2670f1af5d2fSBarry Smith     x[idx]   = s1;
2671f1af5d2fSBarry Smith     x[1+idx] = s2;
2672f1af5d2fSBarry Smith     x[2+idx] = s3;
2673f1af5d2fSBarry Smith     x[3+idx] = s4;
2674f1af5d2fSBarry Smith     x[4+idx] = s5;
267515091d37SBarry Smith   }
267615091d37SBarry Smith   /* backward solve the upper triangular */
267715091d37SBarry Smith   for (i=n-1; i>=0; i--){
267815091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
267915091d37SBarry Smith     vi   = aj + diag[i] + 1;
268015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
268115091d37SBarry Smith     idt  = 5*i;
2682f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
2683f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
268415091d37SBarry Smith     while (nz--) {
268515091d37SBarry Smith       idx   = 5*(*vi++);
268615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
2687f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
2688f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
2689f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
2690f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
2691f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
269215091d37SBarry Smith       v    += 25;
269315091d37SBarry Smith     }
269415091d37SBarry Smith     v        = aa + 25*diag[i];
2695f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
2696f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
2697f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
2698f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
2699f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
270015091d37SBarry Smith   }
270115091d37SBarry Smith 
2702d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27031ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2704dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
270515091d37SBarry Smith   PetscFunctionReturn(0);
270615091d37SBarry Smith }
270715091d37SBarry Smith 
2708cee9d6f2SShri Abhyankar #undef __FUNCT__
2709a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
2710a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
271153cca76cSShri Abhyankar {
271253cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
271353cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
271453cca76cSShri Abhyankar   PetscErrorCode    ierr;
271553cca76cSShri Abhyankar   PetscInt          jdx;
271653cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
271753cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
271853cca76cSShri Abhyankar   const PetscScalar *b;
271953cca76cSShri Abhyankar 
272053cca76cSShri Abhyankar   PetscFunctionBegin;
272153cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
272253cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
272353cca76cSShri Abhyankar   /* forward solve the lower triangular */
272453cca76cSShri Abhyankar   idx    = 0;
272553cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
272653cca76cSShri Abhyankar   for (i=1; i<n; i++) {
272753cca76cSShri Abhyankar     v   = aa + 25*ai[i];
272853cca76cSShri Abhyankar     vi  = aj + ai[i];
272953cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
273053cca76cSShri Abhyankar     idx = 5*i;
273153cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
273253cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
273353cca76cSShri Abhyankar       jdx   = 5*vi[k];
273453cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
273553cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
273653cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
273753cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
273853cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
273953cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
274053cca76cSShri Abhyankar       v    += 25;
274153cca76cSShri Abhyankar     }
274253cca76cSShri Abhyankar     x[idx]   = s1;
274353cca76cSShri Abhyankar     x[1+idx] = s2;
274453cca76cSShri Abhyankar     x[2+idx] = s3;
274553cca76cSShri Abhyankar     x[3+idx] = s4;
274653cca76cSShri Abhyankar     x[4+idx] = s5;
274753cca76cSShri Abhyankar   }
274853cca76cSShri Abhyankar 
274953cca76cSShri Abhyankar   /* backward solve the upper triangular */
275053cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
275153cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
275253cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
275353cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
275453cca76cSShri Abhyankar     idt = 5*i;
275553cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
275653cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
275753cca76cSShri Abhyankar     for(k=0;k<nz;k++){
275853cca76cSShri Abhyankar       idx   = 5*vi[k];
275953cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
276053cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
276153cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
276253cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
276353cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
276453cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
276553cca76cSShri Abhyankar       v    += 25;
276653cca76cSShri Abhyankar     }
276753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
276853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
276953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
277053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
277153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
277253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
277353cca76cSShri Abhyankar   }
277453cca76cSShri Abhyankar 
277553cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
277653cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
277753cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
277853cca76cSShri Abhyankar   PetscFunctionReturn(0);
277953cca76cSShri Abhyankar }
278053cca76cSShri Abhyankar 
278153cca76cSShri Abhyankar #undef __FUNCT__
27824a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4"
2783dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
27844e2b4712SSatish Balay {
27854e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
27864e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
27876849ba73SBarry Smith   PetscErrorCode    ierr;
27885d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
27895d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2790d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2791d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
2792d9fead3dSBarry Smith   const PetscScalar *b;
27934e2b4712SSatish Balay 
27944e2b4712SSatish Balay   PetscFunctionBegin;
2795d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27961ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2797f1af5d2fSBarry Smith   t  = a->solve_work;
27984e2b4712SSatish Balay 
27994e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
28004e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
28014e2b4712SSatish Balay 
28024e2b4712SSatish Balay   /* forward solve the lower triangular */
28034e2b4712SSatish Balay   idx    = 4*(*r++);
2804f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2805f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
28064e2b4712SSatish Balay   for (i=1; i<n; i++) {
28074e2b4712SSatish Balay     v     = aa + 16*ai[i];
28084e2b4712SSatish Balay     vi    = aj + ai[i];
28094e2b4712SSatish Balay     nz    = diag[i] - ai[i];
28104e2b4712SSatish Balay     idx   = 4*(*r++);
2811f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
28124e2b4712SSatish Balay     while (nz--) {
28134e2b4712SSatish Balay       idx   = 4*(*vi++);
2814f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
2815f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2816f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2817f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2818f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
28194e2b4712SSatish Balay       v    += 16;
28204e2b4712SSatish Balay     }
28214e2b4712SSatish Balay     idx        = 4*i;
2822f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2823f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
28244e2b4712SSatish Balay   }
28254e2b4712SSatish Balay   /* backward solve the upper triangular */
28264e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
28274e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
28284e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
28294e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
28304e2b4712SSatish Balay     idt  = 4*i;
2831f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2832f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
28334e2b4712SSatish Balay     while (nz--) {
28344e2b4712SSatish Balay       idx   = 4*(*vi++);
2835f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2836f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2837f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
2838f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
2839f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
2840f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
28414e2b4712SSatish Balay       v += 16;
28424e2b4712SSatish Balay     }
28434e2b4712SSatish Balay     idc      = 4*(*c--);
28444e2b4712SSatish Balay     v        = aa + 16*diag[i];
2845f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
2846f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
2847f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
2848f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
28494e2b4712SSatish Balay   }
28504e2b4712SSatish Balay 
28514e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28524e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2853d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2855dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
28564e2b4712SSatish Balay   PetscFunctionReturn(0);
28574e2b4712SSatish Balay }
2858f26ec98cSKris Buschelman 
28598f690400SShri Abhyankar #undef __FUNCT__
2860a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
2861a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
286278bb4007SShri Abhyankar {
286378bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
286478bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
286578bb4007SShri Abhyankar   PetscErrorCode    ierr;
286678bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
286778bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
286878bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
286978bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
287078bb4007SShri Abhyankar   const PetscScalar *b;
287178bb4007SShri Abhyankar 
287278bb4007SShri Abhyankar   PetscFunctionBegin;
287378bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
287478bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
287578bb4007SShri Abhyankar   t  = a->solve_work;
287678bb4007SShri Abhyankar 
287778bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
287878bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
287978bb4007SShri Abhyankar 
288078bb4007SShri Abhyankar   /* forward solve the lower triangular */
288178bb4007SShri Abhyankar   idx    = 4*r[0];
288278bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
288378bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
288478bb4007SShri Abhyankar   for (i=1; i<n; i++) {
288578bb4007SShri Abhyankar     v     = aa + 16*ai[i];
288678bb4007SShri Abhyankar     vi    = aj + ai[i];
288778bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
288878bb4007SShri Abhyankar     idx   = 4*r[i];
288978bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
289078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
289178bb4007SShri Abhyankar       idx   = 4*vi[m];
289278bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
289378bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
289478bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
289578bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
289678bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
289778bb4007SShri Abhyankar       v    += 16;
289878bb4007SShri Abhyankar     }
289978bb4007SShri Abhyankar     idx        = 4*i;
290078bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
290178bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
290278bb4007SShri Abhyankar   }
290378bb4007SShri Abhyankar   /* backward solve the upper triangular */
290478bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
290578bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
290678bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
290778bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
290878bb4007SShri Abhyankar     idt  = 4*i;
290978bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
291078bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
291178bb4007SShri Abhyankar     for(m=0;m<nz;m++){
291278bb4007SShri Abhyankar       idx   = 4*vi[m];
291378bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
291478bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
291578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
291678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
291778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
291878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
291978bb4007SShri Abhyankar       v += 16;
292078bb4007SShri Abhyankar     }
292178bb4007SShri Abhyankar     idc      = 4*c[i];
292278bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
292378bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
292478bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
292578bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
292678bb4007SShri Abhyankar   }
292778bb4007SShri Abhyankar 
292878bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
292978bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
293078bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
293178bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
293278bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
293378bb4007SShri Abhyankar   PetscFunctionReturn(0);
293478bb4007SShri Abhyankar }
293578bb4007SShri Abhyankar 
293678bb4007SShri Abhyankar #undef __FUNCT__
2937f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
2938dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
2939f26ec98cSKris Buschelman {
2940f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2941f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
29426849ba73SBarry Smith   PetscErrorCode    ierr;
29435d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
29445d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
2945d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2946d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
2947d9fead3dSBarry Smith   PetscScalar       *x;
2948d9fead3dSBarry Smith   const PetscScalar *b;
2949f26ec98cSKris Buschelman 
2950f26ec98cSKris Buschelman   PetscFunctionBegin;
2951d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29521ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2953f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
2954f26ec98cSKris Buschelman 
2955f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
2956f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
2957f26ec98cSKris Buschelman 
2958f26ec98cSKris Buschelman   /* forward solve the lower triangular */
2959f26ec98cSKris Buschelman   idx    = 4*(*r++);
2960f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
2961f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
2962f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
2963f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
2964f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
2965f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
2966f26ec98cSKris Buschelman     vi    = aj + ai[i];
2967f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
2968f26ec98cSKris Buschelman     idx   = 4*(*r++);
2969f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
2970f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
2971f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
2972f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
2973f26ec98cSKris Buschelman     while (nz--) {
2974f26ec98cSKris Buschelman       idx   = 4*(*vi++);
2975f26ec98cSKris Buschelman       x1  = t[idx];
2976f26ec98cSKris Buschelman       x2  = t[1+idx];
2977f26ec98cSKris Buschelman       x3  = t[2+idx];
2978f26ec98cSKris Buschelman       x4  = t[3+idx];
2979f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
2980f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
2981f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
2982f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
2983f26ec98cSKris Buschelman       v    += 16;
2984f26ec98cSKris Buschelman     }
2985f26ec98cSKris Buschelman     idx        = 4*i;
2986f26ec98cSKris Buschelman     t[idx]   = s1;
2987f26ec98cSKris Buschelman     t[1+idx] = s2;
2988f26ec98cSKris Buschelman     t[2+idx] = s3;
2989f26ec98cSKris Buschelman     t[3+idx] = s4;
2990f26ec98cSKris Buschelman   }
2991f26ec98cSKris Buschelman   /* backward solve the upper triangular */
2992f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
2993f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
2994f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
2995f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
2996f26ec98cSKris Buschelman     idt  = 4*i;
2997f26ec98cSKris Buschelman     s1 = t[idt];
2998f26ec98cSKris Buschelman     s2 = t[1+idt];
2999f26ec98cSKris Buschelman     s3 = t[2+idt];
3000f26ec98cSKris Buschelman     s4 = t[3+idt];
3001f26ec98cSKris Buschelman     while (nz--) {
3002f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3003f26ec98cSKris Buschelman       x1  = t[idx];
3004f26ec98cSKris Buschelman       x2  = t[1+idx];
3005f26ec98cSKris Buschelman       x3  = t[2+idx];
3006f26ec98cSKris Buschelman       x4  = t[3+idx];
3007f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3008f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3009f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3010f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3011f26ec98cSKris Buschelman       v += 16;
3012f26ec98cSKris Buschelman     }
3013f26ec98cSKris Buschelman     idc      = 4*(*c--);
3014f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3015f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3016f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3017f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3018f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3019f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3020f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3021f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3022f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3023f26ec98cSKris Buschelman  }
3024f26ec98cSKris Buschelman 
3025f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3026f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3027d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30281ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3029dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3030f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3031f26ec98cSKris Buschelman }
3032f26ec98cSKris Buschelman 
303324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
303424c233c2SKris Buschelman 
303524c233c2SKris Buschelman #include PETSC_HAVE_SSE
303624c233c2SKris Buschelman 
303724c233c2SKris Buschelman #undef __FUNCT__
303824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3039dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
304024c233c2SKris Buschelman {
304124c233c2SKris Buschelman   /*
304224c233c2SKris Buschelman      Note: This code uses demotion of double
304324c233c2SKris Buschelman      to float when performing the mixed-mode computation.
304424c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
304524c233c2SKris Buschelman   */
304624c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
304724c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
30486849ba73SBarry Smith   PetscErrorCode ierr;
30495d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
30505d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
305124c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
305287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
305324c233c2SKris Buschelman 
305424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
305524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
305624c233c2SKris Buschelman   unsigned long   offset;
305724c233c2SKris Buschelman 
305824c233c2SKris Buschelman   PetscFunctionBegin;
305924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
306024c233c2SKris Buschelman 
306124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
306224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
306324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
306424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
306524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
306624c233c2SKris Buschelman 
30671ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
30681ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
306924c233c2SKris Buschelman     t  = a->solve_work;
307024c233c2SKris Buschelman 
307124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
307224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
307324c233c2SKris Buschelman 
307424c233c2SKris Buschelman     /* forward solve the lower triangular */
307524c233c2SKris Buschelman     idx  = 4*(*r++);
307624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
307724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
307824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
307924c233c2SKris Buschelman 
308024c233c2SKris Buschelman     for (i=1; i<n;) {
308124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
308224c233c2SKris Buschelman       vi   =  aj      + ai[i];
308324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
308424c233c2SKris Buschelman       idx  =  4*(*r++);
308524c233c2SKris Buschelman 
308624c233c2SKris Buschelman       /* Demote sum from double to float */
308724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
308824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
308924c233c2SKris Buschelman 
309024c233c2SKris Buschelman       while (nz--) {
309124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
309224c233c2SKris Buschelman         idx = 4*(*vi++);
309324c233c2SKris Buschelman 
309424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
309524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
309624c233c2SKris Buschelman 
309724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
309824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
309924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
310024c233c2SKris Buschelman 
310124c233c2SKris Buschelman           /* First Column */
310224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
310324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
310424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
310524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
310624c233c2SKris Buschelman 
310724c233c2SKris Buschelman           /* Second Column */
310824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
310924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
311024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
311124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
311224c233c2SKris Buschelman 
311324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
311424c233c2SKris Buschelman 
311524c233c2SKris Buschelman           /* Third Column */
311624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
311724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
311824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
311924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
312024c233c2SKris Buschelman 
312124c233c2SKris Buschelman           /* Fourth Column */
312224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
312324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
312424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
312524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
312624c233c2SKris Buschelman         SSE_INLINE_END_2
312724c233c2SKris Buschelman 
312824c233c2SKris Buschelman         v  += 16;
312924c233c2SKris Buschelman       }
313024c233c2SKris Buschelman       idx = 4*i;
313124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
313224c233c2SKris Buschelman       PREFETCH_NTA(v);
313324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
313424c233c2SKris Buschelman 
313524c233c2SKris Buschelman       /* Promote result from float to double */
313624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
313724c233c2SKris Buschelman     }
313824c233c2SKris Buschelman     /* backward solve the upper triangular */
313924c233c2SKris Buschelman     idt  = 4*(n-1);
314024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
314124c233c2SKris Buschelman     v    = aa + ai16 + 16;
314224c233c2SKris Buschelman     for (i=n-1; i>=0;){
314324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
314424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
314524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
314624c233c2SKris Buschelman 
314724c233c2SKris Buschelman       /* Demote accumulator from double to float */
314824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
314924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
315024c233c2SKris Buschelman 
315124c233c2SKris Buschelman       while (nz--) {
315224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
315324c233c2SKris Buschelman         idx = 4*(*vi++);
315424c233c2SKris Buschelman 
315524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
315624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
315724c233c2SKris Buschelman 
315824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
315924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
316024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
316124c233c2SKris Buschelman 
316224c233c2SKris Buschelman           /* First Column */
316324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
316424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
316524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
316624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
316724c233c2SKris Buschelman 
316824c233c2SKris Buschelman           /* Second Column */
316924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
317024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
317124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
317224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
317324c233c2SKris Buschelman 
317424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
317524c233c2SKris Buschelman 
317624c233c2SKris Buschelman           /* Third Column */
317724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
317824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
317924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
318024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
318124c233c2SKris Buschelman 
318224c233c2SKris Buschelman           /* Fourth Column */
318324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
318424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
318524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
318624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
318724c233c2SKris Buschelman         SSE_INLINE_END_2
318824c233c2SKris Buschelman         v  += 16;
318924c233c2SKris Buschelman       }
319024c233c2SKris Buschelman       v    = aa + ai16;
319124c233c2SKris Buschelman       ai16 = 16*diag[--i];
319224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
319324c233c2SKris Buschelman       /*
319424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
319524c233c2SKris Buschelman          which was inverted as part of the factorization
319624c233c2SKris Buschelman       */
319724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
319824c233c2SKris Buschelman         /* First Column */
319924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
320024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
320124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
320224c233c2SKris Buschelman 
320324c233c2SKris Buschelman         /* Second Column */
320424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
320524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
320624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
320724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
320824c233c2SKris Buschelman 
320924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
321024c233c2SKris Buschelman 
321124c233c2SKris Buschelman         /* Third Column */
321224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
321324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
321424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
321524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
321624c233c2SKris Buschelman 
321724c233c2SKris Buschelman         /* Fourth Column */
321824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
321924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
322024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
322124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
322224c233c2SKris Buschelman 
322324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
322424c233c2SKris Buschelman       SSE_INLINE_END_3
322524c233c2SKris Buschelman 
322624c233c2SKris Buschelman       /* Promote solution from float to double */
322724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
322824c233c2SKris Buschelman 
322924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
323024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
323124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
323224c233c2SKris Buschelman       idc  = 4*(*c--);
323324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
323424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
323524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
323624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
323724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
323824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
323924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
324024c233c2SKris Buschelman       SSE_INLINE_END_2
324124c233c2SKris Buschelman       v    = aa + ai16 + 16;
324224c233c2SKris Buschelman       idt -= 4;
324324c233c2SKris Buschelman     }
324424c233c2SKris Buschelman 
324524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
324624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
32471ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
32481ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3249dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
325024c233c2SKris Buschelman   SSE_SCOPE_END;
325124c233c2SKris Buschelman   PetscFunctionReturn(0);
325224c233c2SKris Buschelman }
325324c233c2SKris Buschelman 
325424c233c2SKris Buschelman #endif
32550ef38995SBarry Smith 
32560ef38995SBarry Smith 
32574e2b4712SSatish Balay /*
32584e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
32594e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
32604e2b4712SSatish Balay */
32614a2ae208SSatish Balay #undef __FUNCT__
32624a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
3263dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
32644e2b4712SSatish Balay {
32654e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3266356650c2SBarry Smith   PetscInt          n=a->mbs;
3267356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3268dfbe8321SBarry Smith   PetscErrorCode    ierr;
3269356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3270d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3271d9fead3dSBarry Smith   PetscScalar       *x;
3272d9fead3dSBarry Smith   const PetscScalar *b;
32734e2b4712SSatish Balay 
32744e2b4712SSatish Balay   PetscFunctionBegin;
3275d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32761ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
32774e2b4712SSatish Balay 
3278aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
32792853dc0eSBarry Smith   {
328087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
32812853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
32822853dc0eSBarry Smith   }
3283aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
32842853dc0eSBarry Smith   {
328587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
32862853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
32872853dc0eSBarry Smith   }
3288aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
32892853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3290e1293385SBarry Smith #else
329130d4dcafSBarry Smith   {
329287828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3293d9fead3dSBarry Smith     const MatScalar *v;
3294356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3295356650c2SBarry Smith     const PetscInt  *vi;
3296e1293385SBarry Smith 
32974e2b4712SSatish Balay   /* forward solve the lower triangular */
32984e2b4712SSatish Balay   idx    = 0;
3299e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
33004e2b4712SSatish Balay   for (i=1; i<n; i++) {
33014e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
33024e2b4712SSatish Balay     vi    =  aj      + ai[i];
33034e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3304e1293385SBarry Smith     idx   +=  4;
3305f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
33064e2b4712SSatish Balay     while (nz--) {
33074e2b4712SSatish Balay       jdx   = 4*(*vi++);
33084e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
3309f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3310f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3311f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3312f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
33134e2b4712SSatish Balay       v    += 16;
33144e2b4712SSatish Balay     }
3315f1af5d2fSBarry Smith     x[idx]   = s1;
3316f1af5d2fSBarry Smith     x[1+idx] = s2;
3317f1af5d2fSBarry Smith     x[2+idx] = s3;
3318f1af5d2fSBarry Smith     x[3+idx] = s4;
33194e2b4712SSatish Balay   }
33204e2b4712SSatish Balay   /* backward solve the upper triangular */
33214e555682SBarry Smith   idt = 4*(n-1);
33224e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33234e555682SBarry Smith     ai16 = 16*diag[i];
33244e555682SBarry Smith     v    = aa + ai16 + 16;
33254e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33264e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3327f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3328f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
33294e2b4712SSatish Balay     while (nz--) {
33304e2b4712SSatish Balay       idx   = 4*(*vi++);
33314e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3332f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3333f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3334f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3335f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
33364e2b4712SSatish Balay       v    += 16;
33374e2b4712SSatish Balay     }
33384e555682SBarry Smith     v        = aa + ai16;
3339f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3340f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3341f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3342f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3343329f5518SBarry Smith     idt -= 4;
33444e2b4712SSatish Balay   }
334530d4dcafSBarry Smith   }
3346e1293385SBarry Smith #endif
33474e2b4712SSatish Balay 
3348d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3350dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
33514e2b4712SSatish Balay   PetscFunctionReturn(0);
33524e2b4712SSatish Balay }
33534e2b4712SSatish Balay 
3354b2b2dd24SShri Abhyankar #undef __FUNCT__
3355a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3356a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3357b2b2dd24SShri Abhyankar {
3358b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3359b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3360b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3361b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3362b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3363b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3364b2b2dd24SShri Abhyankar     PetscScalar       *x;
3365b2b2dd24SShri Abhyankar     const PetscScalar *b;
3366b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3367cee9d6f2SShri Abhyankar 
3368b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3369b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3370b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3371b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
3372b2b2dd24SShri Abhyankar     idx    = 0;
3373b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3374b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3375b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3376b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3377b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3378b2b2dd24SShri Abhyankar       idx   = bs*i;
3379b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3380b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3381b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3382b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
3383b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3384b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3385b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3386b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3387b2b2dd24SShri Abhyankar 
3388b2b2dd24SShri Abhyankar           v   +=  bs2;
3389b2b2dd24SShri Abhyankar         }
3390b2b2dd24SShri Abhyankar 
3391b2b2dd24SShri Abhyankar        x[idx]   = s1;
3392b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3393b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3394b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3395b2b2dd24SShri Abhyankar     }
3396b2b2dd24SShri Abhyankar 
3397b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3398b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
3399b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3400b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3401b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3402b2b2dd24SShri Abhyankar      idt = bs*i;
3403b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3404b2b2dd24SShri Abhyankar 
3405b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3406b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3407b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3408b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3409b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3410b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3411b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3412b2b2dd24SShri Abhyankar 
3413b2b2dd24SShri Abhyankar         v   +=  bs2;
3414b2b2dd24SShri Abhyankar     }
3415b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
3416b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3417b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3418b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3419b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3420b2b2dd24SShri Abhyankar 
3421b2b2dd24SShri Abhyankar   }
3422b2b2dd24SShri Abhyankar 
3423b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3424b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3425b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3426b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3427b2b2dd24SShri Abhyankar }
3428cee9d6f2SShri Abhyankar 
3429cee9d6f2SShri Abhyankar #undef __FUNCT__
3430f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
3431dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3432f26ec98cSKris Buschelman {
3433f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3434690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3435dfbe8321SBarry Smith   PetscErrorCode ierr;
3436690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3437f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3438f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3439f26ec98cSKris Buschelman 
3440f26ec98cSKris Buschelman   PetscFunctionBegin;
34411ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
34421ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3443f26ec98cSKris Buschelman 
3444f26ec98cSKris Buschelman   {
3445f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
3446f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3447690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3448f26ec98cSKris Buschelman 
3449f26ec98cSKris Buschelman     /* forward solve the lower triangular */
3450f26ec98cSKris Buschelman     idx  = 0;
3451f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3452f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3453f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3454f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3455f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
3456f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3457f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3458f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3459f26ec98cSKris Buschelman       idx   +=  4;
3460f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3461f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3462f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3463f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3464f26ec98cSKris Buschelman       while (nz--) {
3465f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3466f26ec98cSKris Buschelman         x1  = t[jdx];
3467f26ec98cSKris Buschelman         x2  = t[1+jdx];
3468f26ec98cSKris Buschelman         x3  = t[2+jdx];
3469f26ec98cSKris Buschelman         x4  = t[3+jdx];
3470f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3471f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3472f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3473f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3474f26ec98cSKris Buschelman         v    += 16;
3475f26ec98cSKris Buschelman       }
3476f26ec98cSKris Buschelman       t[idx]   = s1;
3477f26ec98cSKris Buschelman       t[1+idx] = s2;
3478f26ec98cSKris Buschelman       t[2+idx] = s3;
3479f26ec98cSKris Buschelman       t[3+idx] = s4;
3480f26ec98cSKris Buschelman     }
3481f26ec98cSKris Buschelman     /* backward solve the upper triangular */
3482f26ec98cSKris Buschelman     idt = 4*(n-1);
3483f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
3484f26ec98cSKris Buschelman       ai16 = 16*diag[i];
3485f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
3486f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
3487f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
3488f26ec98cSKris Buschelman       s1   = t[idt];
3489f26ec98cSKris Buschelman       s2   = t[1+idt];
3490f26ec98cSKris Buschelman       s3   = t[2+idt];
3491f26ec98cSKris Buschelman       s4   = t[3+idt];
3492f26ec98cSKris Buschelman       while (nz--) {
3493f26ec98cSKris Buschelman         idx = 4*(*vi++);
3494f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
3495f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
3496f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
3497f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
3498f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3499f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3500f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3501f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3502f26ec98cSKris Buschelman         v    += 16;
3503f26ec98cSKris Buschelman       }
3504f26ec98cSKris Buschelman       v        = aa + ai16;
3505f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
3506f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
3507f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
3508f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
3509f26ec98cSKris Buschelman       idt -= 4;
3510f26ec98cSKris Buschelman     }
3511f26ec98cSKris Buschelman   }
3512f26ec98cSKris Buschelman 
35131ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
35141ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3515dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3516f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3517f26ec98cSKris Buschelman }
3518f26ec98cSKris Buschelman 
35193660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
35203660e330SKris Buschelman 
35213660e330SKris Buschelman #include PETSC_HAVE_SSE
35223660e330SKris Buschelman #undef __FUNCT__
35237cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
3524dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
35253660e330SKris Buschelman {
35263660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
35272aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
3528dfbe8321SBarry Smith   PetscErrorCode ierr;
3529dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
35303660e330SKris Buschelman   MatScalar      *aa=a->a;
353187828ca2SBarry Smith   PetscScalar    *x,*b;
35323660e330SKris Buschelman 
35333660e330SKris Buschelman   PetscFunctionBegin;
35343660e330SKris Buschelman   SSE_SCOPE_BEGIN;
35353660e330SKris Buschelman   /*
35363660e330SKris Buschelman      Note: This code currently uses demotion of double
35373660e330SKris Buschelman      to float when performing the mixed-mode computation.
35383660e330SKris Buschelman      This may not be numerically reasonable for all applications.
35393660e330SKris Buschelman   */
35403660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
35413660e330SKris Buschelman 
35421ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
35431ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
35443660e330SKris Buschelman   {
3545eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
3546eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
35472aa5897fSKris Buschelman     int            nz,i,idt,ai16;
35482aa5897fSKris Buschelman     unsigned int   jdx,idx;
35492aa5897fSKris Buschelman     unsigned short *vi;
3550eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
35513660e330SKris Buschelman 
3552eb05f457SKris Buschelman     /* First block is the identity. */
35533660e330SKris Buschelman     idx  = 0;
3554eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
35552aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
35563660e330SKris Buschelman 
35573660e330SKris Buschelman     for (i=1; i<n;) {
35583660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
35593660e330SKris Buschelman       vi   =  aj      + ai[i];
35603660e330SKris Buschelman       nz   =  diag[i] - ai[i];
35613660e330SKris Buschelman       idx +=  4;
35623660e330SKris Buschelman 
3563eb05f457SKris Buschelman       /* Demote RHS from double to float. */
3564eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
3565eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
35663660e330SKris Buschelman 
35673660e330SKris Buschelman       while (nz--) {
35683660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
35692aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
35703660e330SKris Buschelman 
35713660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
3572eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
35733660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
35743660e330SKris Buschelman 
35753660e330SKris Buschelman           /* First Column */
35763660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
35773660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
35783660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
35793660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
35803660e330SKris Buschelman 
35813660e330SKris Buschelman           /* Second Column */
35823660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
35833660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
35843660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
35853660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
35863660e330SKris Buschelman 
35873660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
35883660e330SKris Buschelman 
35893660e330SKris Buschelman           /* Third Column */
35903660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
35913660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
35923660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
35933660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
35943660e330SKris Buschelman 
35953660e330SKris Buschelman           /* Fourth Column */
35963660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
35973660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
35983660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
35993660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36003660e330SKris Buschelman         SSE_INLINE_END_2
36013660e330SKris Buschelman 
36023660e330SKris Buschelman         v  += 16;
36033660e330SKris Buschelman       }
36043660e330SKris Buschelman       v    =  aa + 16*ai[++i];
36053660e330SKris Buschelman       PREFETCH_NTA(v);
3606eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
36073660e330SKris Buschelman     }
3608eb05f457SKris Buschelman 
3609eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
3610eb05f457SKris Buschelman 
36113660e330SKris Buschelman     idt  = 4*(n-1);
36123660e330SKris Buschelman     ai16 = 16*diag[n-1];
36133660e330SKris Buschelman     v    = aa + ai16 + 16;
36143660e330SKris Buschelman     for (i=n-1; i>=0;){
36153660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
36163660e330SKris Buschelman       vi = aj + diag[i] + 1;
36173660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
36183660e330SKris Buschelman 
3619eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
36203660e330SKris Buschelman 
36213660e330SKris Buschelman       while (nz--) {
36223660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
36232aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
36243660e330SKris Buschelman 
36253660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
3626eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
36273660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
36283660e330SKris Buschelman 
36293660e330SKris Buschelman           /* First Column */
36303660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
36313660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
36323660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
36333660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
36343660e330SKris Buschelman 
36353660e330SKris Buschelman           /* Second Column */
36363660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
36373660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
36383660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
36393660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
36403660e330SKris Buschelman 
36413660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
36423660e330SKris Buschelman 
36433660e330SKris Buschelman           /* Third Column */
36443660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
36453660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
36463660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
36473660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
36483660e330SKris Buschelman 
36493660e330SKris Buschelman           /* Fourth Column */
36503660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
36513660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
36523660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
36533660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
36543660e330SKris Buschelman         SSE_INLINE_END_2
36553660e330SKris Buschelman         v  += 16;
36563660e330SKris Buschelman       }
36573660e330SKris Buschelman       v    = aa + ai16;
36583660e330SKris Buschelman       ai16 = 16*diag[--i];
36593660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
36603660e330SKris Buschelman       /*
36613660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
36623660e330SKris Buschelman          which was inverted as part of the factorization
36633660e330SKris Buschelman       */
3664eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
36653660e330SKris Buschelman         /* First Column */
36663660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
36673660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
36683660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
36693660e330SKris Buschelman 
36703660e330SKris Buschelman         /* Second Column */
36713660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
36723660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
36733660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
36743660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
36753660e330SKris Buschelman 
36763660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
36773660e330SKris Buschelman 
36783660e330SKris Buschelman         /* Third Column */
36793660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
36803660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
36813660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
36823660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
36833660e330SKris Buschelman 
36843660e330SKris Buschelman         /* Fourth Column */
36853660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
36863660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
36873660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
36883660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
36893660e330SKris Buschelman 
36903660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
36913660e330SKris Buschelman       SSE_INLINE_END_3
36923660e330SKris Buschelman 
36933660e330SKris Buschelman       v    = aa + ai16 + 16;
36943660e330SKris Buschelman       idt -= 4;
36953660e330SKris Buschelman     }
3696eb05f457SKris Buschelman 
3697eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
3698eb05f457SKris Buschelman     idt = 4*(n-1);
3699eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
3700eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
3701eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
3702eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
3703eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
3704eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
3705eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
3706eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
3707eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
370854693613SKris Buschelman       idt -= 4;
37093660e330SKris Buschelman     }
3710eb05f457SKris Buschelman 
3711eb05f457SKris Buschelman   } /* End of artificial scope. */
37121ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
37131ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3714dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
37153660e330SKris Buschelman   SSE_SCOPE_END;
37163660e330SKris Buschelman   PetscFunctionReturn(0);
37173660e330SKris Buschelman }
37183660e330SKris Buschelman 
37197cf1b8d3SKris Buschelman #undef __FUNCT__
37207cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
3721dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
37227cf1b8d3SKris Buschelman {
37237cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
37247cf1b8d3SKris Buschelman   int            *aj=a->j;
3725dfbe8321SBarry Smith   PetscErrorCode ierr;
3726dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
37277cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
37287cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
37297cf1b8d3SKris Buschelman 
37307cf1b8d3SKris Buschelman   PetscFunctionBegin;
37317cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
37327cf1b8d3SKris Buschelman   /*
37337cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
37347cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
37357cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
37367cf1b8d3SKris Buschelman   */
37377cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
37387cf1b8d3SKris Buschelman 
37391ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
37401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
37417cf1b8d3SKris Buschelman   {
37427cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
37437cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
37447cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
37457cf1b8d3SKris Buschelman     int       jdx,idx;
37467cf1b8d3SKris Buschelman     int       *vi;
37477cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
37487cf1b8d3SKris Buschelman 
37497cf1b8d3SKris Buschelman     /* First block is the identity. */
37507cf1b8d3SKris Buschelman     idx  = 0;
37517cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
37527cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
37537cf1b8d3SKris Buschelman 
37547cf1b8d3SKris Buschelman     for (i=1; i<n;) {
37557cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
37567cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
37577cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
37587cf1b8d3SKris Buschelman       idx +=  4;
37597cf1b8d3SKris Buschelman 
37607cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
37617cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
37627cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
37637cf1b8d3SKris Buschelman 
37647cf1b8d3SKris Buschelman       while (nz--) {
37657cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
37667cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
37677cf1b8d3SKris Buschelman /*          jdx = *vi++; */
37687cf1b8d3SKris Buschelman 
37697cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
37707cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
37717cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
37727cf1b8d3SKris Buschelman 
37737cf1b8d3SKris Buschelman           /* First Column */
37747cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
37757cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
37767cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
37777cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
37787cf1b8d3SKris Buschelman 
37797cf1b8d3SKris Buschelman           /* Second Column */
37807cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
37817cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
37827cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
37837cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
37847cf1b8d3SKris Buschelman 
37857cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
37867cf1b8d3SKris Buschelman 
37877cf1b8d3SKris Buschelman           /* Third Column */
37887cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
37897cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
37907cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
37917cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
37927cf1b8d3SKris Buschelman 
37937cf1b8d3SKris Buschelman           /* Fourth Column */
37947cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
37957cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
37967cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
37977cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
37987cf1b8d3SKris Buschelman         SSE_INLINE_END_2
37997cf1b8d3SKris Buschelman 
38007cf1b8d3SKris Buschelman         v  += 16;
38017cf1b8d3SKris Buschelman       }
38027cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
38037cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
38047cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
38057cf1b8d3SKris Buschelman     }
38067cf1b8d3SKris Buschelman 
38077cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
38087cf1b8d3SKris Buschelman 
38097cf1b8d3SKris Buschelman     idt  = 4*(n-1);
38107cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
38117cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
38127cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
38137cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
38147cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
38157cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
38167cf1b8d3SKris Buschelman 
38177cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
38187cf1b8d3SKris Buschelman 
38197cf1b8d3SKris Buschelman       while (nz--) {
38207cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
38217cf1b8d3SKris Buschelman         idx = 4*(*vi++);
38227cf1b8d3SKris Buschelman /*          idx = *vi++; */
38237cf1b8d3SKris Buschelman 
38247cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
38257cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
38267cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
38277cf1b8d3SKris Buschelman 
38287cf1b8d3SKris Buschelman           /* First Column */
38297cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
38307cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
38317cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
38327cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
38337cf1b8d3SKris Buschelman 
38347cf1b8d3SKris Buschelman           /* Second Column */
38357cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
38367cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
38377cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
38387cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
38397cf1b8d3SKris Buschelman 
38407cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
38417cf1b8d3SKris Buschelman 
38427cf1b8d3SKris Buschelman           /* Third Column */
38437cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
38447cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
38457cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
38467cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
38477cf1b8d3SKris Buschelman 
38487cf1b8d3SKris Buschelman           /* Fourth Column */
38497cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
38507cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
38517cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
38527cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
38537cf1b8d3SKris Buschelman         SSE_INLINE_END_2
38547cf1b8d3SKris Buschelman         v  += 16;
38557cf1b8d3SKris Buschelman       }
38567cf1b8d3SKris Buschelman       v    = aa + ai16;
38577cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
38587cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
38597cf1b8d3SKris Buschelman       /*
38607cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
38617cf1b8d3SKris Buschelman          which was inverted as part of the factorization
38627cf1b8d3SKris Buschelman       */
38637cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
38647cf1b8d3SKris Buschelman         /* First Column */
38657cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
38667cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
38677cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
38687cf1b8d3SKris Buschelman 
38697cf1b8d3SKris Buschelman         /* Second Column */
38707cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
38717cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
38727cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
38737cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
38747cf1b8d3SKris Buschelman 
38757cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
38767cf1b8d3SKris Buschelman 
38777cf1b8d3SKris Buschelman         /* Third Column */
38787cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
38797cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
38807cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
38817cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
38827cf1b8d3SKris Buschelman 
38837cf1b8d3SKris Buschelman         /* Fourth Column */
38847cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
38857cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
38867cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
38877cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
38887cf1b8d3SKris Buschelman 
38897cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
38907cf1b8d3SKris Buschelman       SSE_INLINE_END_3
38917cf1b8d3SKris Buschelman 
38927cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
38937cf1b8d3SKris Buschelman       idt -= 4;
38947cf1b8d3SKris Buschelman     }
38957cf1b8d3SKris Buschelman 
38967cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
38977cf1b8d3SKris Buschelman     idt = 4*(n-1);
38987cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
38997cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
39007cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
39017cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
39027cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
39037cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
39047cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
39057cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
39067cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
39077cf1b8d3SKris Buschelman       idt -= 4;
39087cf1b8d3SKris Buschelman     }
39097cf1b8d3SKris Buschelman 
39107cf1b8d3SKris Buschelman   } /* End of artificial scope. */
39111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
39121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3913dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
39147cf1b8d3SKris Buschelman   SSE_SCOPE_END;
39157cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
39167cf1b8d3SKris Buschelman }
39177cf1b8d3SKris Buschelman 
39183660e330SKris Buschelman #endif
39198f690400SShri Abhyankar 
39204a2ae208SSatish Balay #undef __FUNCT__
39214a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3"
3922dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
39234e2b4712SSatish Balay {
39244e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
39254e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
39266849ba73SBarry Smith   PetscErrorCode    ierr;
39275d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
39285d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3929d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3930d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
3931d9fead3dSBarry Smith   const PetscScalar *b;
39324e2b4712SSatish Balay 
39334e2b4712SSatish Balay   PetscFunctionBegin;
3934d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39351ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3936f1af5d2fSBarry Smith   t  = a->solve_work;
39374e2b4712SSatish Balay 
39384e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
39394e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
39404e2b4712SSatish Balay 
39414e2b4712SSatish Balay   /* forward solve the lower triangular */
39424e2b4712SSatish Balay   idx    = 3*(*r++);
3943f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
39444e2b4712SSatish Balay   for (i=1; i<n; i++) {
39454e2b4712SSatish Balay     v     = aa + 9*ai[i];
39464e2b4712SSatish Balay     vi    = aj + ai[i];
39474e2b4712SSatish Balay     nz    = diag[i] - ai[i];
39484e2b4712SSatish Balay     idx   = 3*(*r++);
3949f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
39504e2b4712SSatish Balay     while (nz--) {
39514e2b4712SSatish Balay       idx   = 3*(*vi++);
3952f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3953f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3954f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3955f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39564e2b4712SSatish Balay       v += 9;
39574e2b4712SSatish Balay     }
39584e2b4712SSatish Balay     idx = 3*i;
3959f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
39604e2b4712SSatish Balay   }
39614e2b4712SSatish Balay   /* backward solve the upper triangular */
39624e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
39634e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
39644e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
39654e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
39664e2b4712SSatish Balay     idt  = 3*i;
3967f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
39684e2b4712SSatish Balay     while (nz--) {
39694e2b4712SSatish Balay       idx   = 3*(*vi++);
3970f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
3971f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
3972f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
3973f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
39744e2b4712SSatish Balay       v += 9;
39754e2b4712SSatish Balay     }
39764e2b4712SSatish Balay     idc = 3*(*c--);
39774e2b4712SSatish Balay     v   = aa + 9*diag[i];
3978f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
3979f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
3980f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
39814e2b4712SSatish Balay   }
39824e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
39834e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3984d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
39851ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3986dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
39874e2b4712SSatish Balay   PetscFunctionReturn(0);
39884e2b4712SSatish Balay }
39894e2b4712SSatish Balay 
39900c4413a7SShri Abhyankar #undef __FUNCT__
3991a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
3992a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
39930c4413a7SShri Abhyankar {
39940c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
39950c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
39960c4413a7SShri Abhyankar   PetscErrorCode    ierr;
39970c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
39980c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
39990c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
40000c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
40010c4413a7SShri Abhyankar   const PetscScalar *b;
40020c4413a7SShri Abhyankar 
40030c4413a7SShri Abhyankar   PetscFunctionBegin;
40040c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40050c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
40060c4413a7SShri Abhyankar   t  = a->solve_work;
40070c4413a7SShri Abhyankar 
40080c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
40090c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
40100c4413a7SShri Abhyankar 
40110c4413a7SShri Abhyankar   /* forward solve the lower triangular */
40120c4413a7SShri Abhyankar   idx    = 3*r[0];
40130c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
40140c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
40150c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
40160c4413a7SShri Abhyankar     vi    = aj + ai[i];
40170c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
40180c4413a7SShri Abhyankar     idx   = 3*r[i];
40190c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
40200c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
40210c4413a7SShri Abhyankar       idx   = 3*vi[m];
40220c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40230c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40240c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40250c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40260c4413a7SShri Abhyankar       v += 9;
40270c4413a7SShri Abhyankar     }
40280c4413a7SShri Abhyankar     idx = 3*i;
40290c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
40300c4413a7SShri Abhyankar   }
40310c4413a7SShri Abhyankar   /* backward solve the upper triangular */
40320c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
40330c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
40340c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
40350c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
40360c4413a7SShri Abhyankar     idt  = 3*i;
40370c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
40380c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
40390c4413a7SShri Abhyankar       idx   = 3*vi[m];
40400c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
40410c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
40420c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
40430c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
40440c4413a7SShri Abhyankar       v += 9;
40450c4413a7SShri Abhyankar     }
40460c4413a7SShri Abhyankar     idc = 3*c[i];
40470c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
40480c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
40490c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
40500c4413a7SShri Abhyankar   }
40510c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
40520c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40530c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40540c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
40550c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
40560c4413a7SShri Abhyankar   PetscFunctionReturn(0);
40570c4413a7SShri Abhyankar }
40580c4413a7SShri Abhyankar 
405915091d37SBarry Smith /*
406015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
406115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
406215091d37SBarry Smith */
40634a2ae208SSatish Balay #undef __FUNCT__
40644a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
4065dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
406615091d37SBarry Smith {
406715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4068690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4069dfbe8321SBarry Smith   PetscErrorCode    ierr;
4070690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4071d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4072d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4073d9fead3dSBarry Smith   const PetscScalar *b;
4074690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
407515091d37SBarry Smith 
407615091d37SBarry Smith   PetscFunctionBegin;
4077d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40781ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
407915091d37SBarry Smith 
408015091d37SBarry Smith   /* forward solve the lower triangular */
408115091d37SBarry Smith   idx    = 0;
408215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
408315091d37SBarry Smith   for (i=1; i<n; i++) {
408415091d37SBarry Smith     v     =  aa      + 9*ai[i];
408515091d37SBarry Smith     vi    =  aj      + ai[i];
408615091d37SBarry Smith     nz    =  diag[i] - ai[i];
408715091d37SBarry Smith     idx   +=  3;
4088f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
408915091d37SBarry Smith     while (nz--) {
409015091d37SBarry Smith       jdx   = 3*(*vi++);
409115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4092f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4093f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4094f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
409515091d37SBarry Smith       v    += 9;
409615091d37SBarry Smith     }
4097f1af5d2fSBarry Smith     x[idx]   = s1;
4098f1af5d2fSBarry Smith     x[1+idx] = s2;
4099f1af5d2fSBarry Smith     x[2+idx] = s3;
410015091d37SBarry Smith   }
410115091d37SBarry Smith   /* backward solve the upper triangular */
410215091d37SBarry Smith   for (i=n-1; i>=0; i--){
410315091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
410415091d37SBarry Smith     vi   = aj + diag[i] + 1;
410515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
410615091d37SBarry Smith     idt  = 3*i;
4107f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4108f1af5d2fSBarry Smith     s3 = x[2+idt];
410915091d37SBarry Smith     while (nz--) {
411015091d37SBarry Smith       idx   = 3*(*vi++);
411115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4112f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4113f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4114f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
411515091d37SBarry Smith       v    += 9;
411615091d37SBarry Smith     }
411715091d37SBarry Smith     v        = aa +  9*diag[i];
4118f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4119f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4120f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
412115091d37SBarry Smith   }
412215091d37SBarry Smith 
4123d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41241ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4125dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
412615091d37SBarry Smith   PetscFunctionReturn(0);
412715091d37SBarry Smith }
412815091d37SBarry Smith 
4129cee9d6f2SShri Abhyankar #undef __FUNCT__
4130a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4131a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4132b2b2dd24SShri Abhyankar {
4133b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4134b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4135b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4136b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4137b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4138b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4139b2b2dd24SShri Abhyankar     PetscScalar       *x;
4140b2b2dd24SShri Abhyankar     const PetscScalar *b;
4141b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4142b2b2dd24SShri Abhyankar 
4143b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4144b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4145b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4146b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4147b2b2dd24SShri Abhyankar     idx    = 0;
4148b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4149b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4150b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4151b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4152b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4153b2b2dd24SShri Abhyankar       idx   = bs*i;
4154b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4155b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4156b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4157b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4158b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4159b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4160b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4161b2b2dd24SShri Abhyankar 
4162b2b2dd24SShri Abhyankar           v   +=  bs2;
4163b2b2dd24SShri Abhyankar         }
4164b2b2dd24SShri Abhyankar 
4165b2b2dd24SShri Abhyankar        x[idx]   = s1;
4166b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4167b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4168b2b2dd24SShri Abhyankar     }
4169b2b2dd24SShri Abhyankar 
4170b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4171b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4172b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4173b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4174b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4175b2b2dd24SShri Abhyankar      idt = bs*i;
4176b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4177b2b2dd24SShri Abhyankar 
4178b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4179b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4180b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4181b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4182b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4183b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4184b2b2dd24SShri Abhyankar 
4185b2b2dd24SShri Abhyankar         v   +=  bs2;
4186b2b2dd24SShri Abhyankar     }
4187b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4188b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4189b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4190b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4191b2b2dd24SShri Abhyankar 
4192b2b2dd24SShri Abhyankar   }
4193b2b2dd24SShri Abhyankar 
4194b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4195b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4196b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4197b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4198b2b2dd24SShri Abhyankar }
4199b2b2dd24SShri Abhyankar 
4200b2b2dd24SShri Abhyankar #undef __FUNCT__
42014a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2"
4202dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
42034e2b4712SSatish Balay {
42044e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42054e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
42066849ba73SBarry Smith   PetscErrorCode    ierr;
42075d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
42085d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4209d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4210d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4211d9fead3dSBarry Smith   const PetscScalar *b;
42124e2b4712SSatish Balay 
42134e2b4712SSatish Balay   PetscFunctionBegin;
4214d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4216f1af5d2fSBarry Smith   t  = a->solve_work;
42174e2b4712SSatish Balay 
42184e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
42194e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
42204e2b4712SSatish Balay 
42214e2b4712SSatish Balay   /* forward solve the lower triangular */
42224e2b4712SSatish Balay   idx    = 2*(*r++);
4223f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
42244e2b4712SSatish Balay   for (i=1; i<n; i++) {
42254e2b4712SSatish Balay     v     = aa + 4*ai[i];
42264e2b4712SSatish Balay     vi    = aj + ai[i];
42274e2b4712SSatish Balay     nz    = diag[i] - ai[i];
42284e2b4712SSatish Balay     idx   = 2*(*r++);
4229f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
42304e2b4712SSatish Balay     while (nz--) {
42314e2b4712SSatish Balay       idx   = 2*(*vi++);
4232f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4233f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4234f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
42354e2b4712SSatish Balay       v += 4;
42364e2b4712SSatish Balay     }
42374e2b4712SSatish Balay     idx = 2*i;
4238f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
42394e2b4712SSatish Balay   }
42404e2b4712SSatish Balay   /* backward solve the upper triangular */
42414e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
42424e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
42434e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
42444e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
42454e2b4712SSatish Balay     idt  = 2*i;
4246f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
42474e2b4712SSatish Balay     while (nz--) {
42484e2b4712SSatish Balay       idx   = 2*(*vi++);
4249f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4250f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4251f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
42524e2b4712SSatish Balay       v += 4;
42534e2b4712SSatish Balay     }
42544e2b4712SSatish Balay     idc = 2*(*c--);
42554e2b4712SSatish Balay     v   = aa + 4*diag[i];
4256f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4257f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
42584e2b4712SSatish Balay   }
42594e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
42604e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4261d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42621ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4263dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
42644e2b4712SSatish Balay   PetscFunctionReturn(0);
42654e2b4712SSatish Balay }
42664e2b4712SSatish Balay 
42670c4413a7SShri Abhyankar #undef __FUNCT__
4268a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4269a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
42700c4413a7SShri Abhyankar {
42710c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
42720c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
42730c4413a7SShri Abhyankar   PetscErrorCode    ierr;
42740c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
42750c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
42760c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
42770c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
42780c4413a7SShri Abhyankar   const PetscScalar *b;
42790c4413a7SShri Abhyankar 
42800c4413a7SShri Abhyankar   PetscFunctionBegin;
42810c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42820c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
42830c4413a7SShri Abhyankar   t  = a->solve_work;
42840c4413a7SShri Abhyankar 
42850c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
42860c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
42870c4413a7SShri Abhyankar 
42880c4413a7SShri Abhyankar   /* forward solve the lower triangular */
42890c4413a7SShri Abhyankar   idx    = 2*r[0];
42900c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
42910c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
42920c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
42930c4413a7SShri Abhyankar     vi    = aj + ai[i];
42940c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
42950c4413a7SShri Abhyankar     idx   = 2*r[i];
42960c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
42970c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
42980c4413a7SShri Abhyankar       jdx   = 2*vi[m];
42990c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
43000c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
43010c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
43020c4413a7SShri Abhyankar       v += 4;
43030c4413a7SShri Abhyankar     }
43040c4413a7SShri Abhyankar     idx = 2*i;
43050c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
43060c4413a7SShri Abhyankar   }
43070c4413a7SShri Abhyankar   /* backward solve the upper triangular */
43080c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
43090c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
43100c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
43110c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
43120c4413a7SShri Abhyankar     idt  = 2*i;
43130c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
43140c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
43150c4413a7SShri Abhyankar       idx   = 2*vi[m];
43160c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
43170c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
43180c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
43190c4413a7SShri Abhyankar       v += 4;
43200c4413a7SShri Abhyankar     }
43210c4413a7SShri Abhyankar     idc = 2*c[i];
43220c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
43230c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
43240c4413a7SShri Abhyankar   }
43250c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
43260c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
43270c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43280c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
43290c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
43300c4413a7SShri Abhyankar   PetscFunctionReturn(0);
43310c4413a7SShri Abhyankar }
43328f690400SShri Abhyankar 
433315091d37SBarry Smith /*
433415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
433515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
433615091d37SBarry Smith */
43374a2ae208SSatish Balay #undef __FUNCT__
43384a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
4339dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
434015091d37SBarry Smith {
434115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4342690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4343dfbe8321SBarry Smith   PetscErrorCode    ierr;
4344690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4345d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4346d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4347d9fead3dSBarry Smith   const PetscScalar *b;
4348690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
434915091d37SBarry Smith 
435015091d37SBarry Smith   PetscFunctionBegin;
4351d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43521ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
435315091d37SBarry Smith 
435415091d37SBarry Smith   /* forward solve the lower triangular */
435515091d37SBarry Smith   idx    = 0;
435615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
435715091d37SBarry Smith   for (i=1; i<n; i++) {
435815091d37SBarry Smith     v     =  aa      + 4*ai[i];
435915091d37SBarry Smith     vi    =  aj      + ai[i];
436015091d37SBarry Smith     nz    =  diag[i] - ai[i];
436115091d37SBarry Smith     idx   +=  2;
4362f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
436315091d37SBarry Smith     while (nz--) {
436415091d37SBarry Smith       jdx   = 2*(*vi++);
436515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4366f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4367f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
436815091d37SBarry Smith       v    += 4;
436915091d37SBarry Smith     }
4370f1af5d2fSBarry Smith     x[idx]   = s1;
4371f1af5d2fSBarry Smith     x[1+idx] = s2;
437215091d37SBarry Smith   }
437315091d37SBarry Smith   /* backward solve the upper triangular */
437415091d37SBarry Smith   for (i=n-1; i>=0; i--){
437515091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
437615091d37SBarry Smith     vi   = aj + diag[i] + 1;
437715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
437815091d37SBarry Smith     idt  = 2*i;
4379f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
438015091d37SBarry Smith     while (nz--) {
438115091d37SBarry Smith       idx   = 2*(*vi++);
438215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4383f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4384f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
438515091d37SBarry Smith       v    += 4;
438615091d37SBarry Smith     }
438715091d37SBarry Smith     v        = aa +  4*diag[i];
4388f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4389f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
439015091d37SBarry Smith   }
439115091d37SBarry Smith 
4392d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
43931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4394dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
439515091d37SBarry Smith   PetscFunctionReturn(0);
439615091d37SBarry Smith }
439715091d37SBarry Smith 
4398cee9d6f2SShri Abhyankar #undef __FUNCT__
4399a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4400a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4401b2b2dd24SShri Abhyankar {
4402b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4403b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4404b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4405b2b2dd24SShri Abhyankar     PetscInt          jdx;
4406b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4407b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4408b2b2dd24SShri Abhyankar     const PetscScalar *b;
4409b2b2dd24SShri Abhyankar 
4410b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4411b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4412b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4413b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4414b2b2dd24SShri Abhyankar     idx    = 0;
4415b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4416b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4417b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4418b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4419b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4420b2b2dd24SShri Abhyankar        idx  = 2*i;
4421b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4422b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4423b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4424b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4425b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4426b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4427b2b2dd24SShri Abhyankar            v   +=  4;
4428b2b2dd24SShri Abhyankar         }
4429b2b2dd24SShri Abhyankar        x[idx]   = s1;
4430b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4431b2b2dd24SShri Abhyankar     }
4432b2b2dd24SShri Abhyankar 
4433b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4434b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4435b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4436b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4437b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4438b2b2dd24SShri Abhyankar      idt = 2*i;
4439b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4440b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4441b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4442b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4443b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4444b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4445b2b2dd24SShri Abhyankar          v    += 4;
4446b2b2dd24SShri Abhyankar     }
4447b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4448b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4449b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4450b2b2dd24SShri Abhyankar   }
4451b2b2dd24SShri Abhyankar 
4452b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4453b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4454b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4455b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4456b2b2dd24SShri Abhyankar }
4457b2b2dd24SShri Abhyankar 
4458b2b2dd24SShri Abhyankar #undef __FUNCT__
44594a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1"
4460dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
44614e2b4712SSatish Balay {
44624e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
44634e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
44646849ba73SBarry Smith   PetscErrorCode ierr;
44655d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
44665d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
44673f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
446887828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
44694e2b4712SSatish Balay 
44704e2b4712SSatish Balay   PetscFunctionBegin;
44714e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
44724e2b4712SSatish Balay 
44731ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
44741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4475f1af5d2fSBarry Smith   t  = a->solve_work;
44764e2b4712SSatish Balay 
44774e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
44784e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
44794e2b4712SSatish Balay 
44804e2b4712SSatish Balay   /* forward solve the lower triangular */
4481f1af5d2fSBarry Smith   t[0] = b[*r++];
44824e2b4712SSatish Balay   for (i=1; i<n; i++) {
44834e2b4712SSatish Balay     v     = aa + ai[i];
44844e2b4712SSatish Balay     vi    = aj + ai[i];
44854e2b4712SSatish Balay     nz    = diag[i] - ai[i];
4486f1af5d2fSBarry Smith     s1  = b[*r++];
44874e2b4712SSatish Balay     while (nz--) {
4488f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
44894e2b4712SSatish Balay     }
4490f1af5d2fSBarry Smith     t[i] = s1;
44914e2b4712SSatish Balay   }
44924e2b4712SSatish Balay   /* backward solve the upper triangular */
44934e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
44944e2b4712SSatish Balay     v    = aa + diag[i] + 1;
44954e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
44964e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4497f1af5d2fSBarry Smith     s1 = t[i];
44984e2b4712SSatish Balay     while (nz--) {
4499f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
45004e2b4712SSatish Balay     }
4501f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
45024e2b4712SSatish Balay   }
45034e2b4712SSatish Balay 
45044e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45054e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45061ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4508dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
45094e2b4712SSatish Balay   PetscFunctionReturn(0);
45104e2b4712SSatish Balay }
451115091d37SBarry Smith /*
451215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
451315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
451415091d37SBarry Smith */
45154a2ae208SSatish Balay #undef __FUNCT__
45164a2ae208SSatish Balay #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
4517dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
451815091d37SBarry Smith {
451915091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
4520690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
4521dfbe8321SBarry Smith   PetscErrorCode ierr;
4522690b6cddSBarry Smith   PetscInt       *diag = a->diag;
452315091d37SBarry Smith   MatScalar      *aa=a->a;
452487828ca2SBarry Smith   PetscScalar    *x,*b;
452587828ca2SBarry Smith   PetscScalar    s1,x1;
452615091d37SBarry Smith   MatScalar      *v;
4527690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
452815091d37SBarry Smith 
452915091d37SBarry Smith   PetscFunctionBegin;
45301ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45311ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
453215091d37SBarry Smith 
453315091d37SBarry Smith   /* forward solve the lower triangular */
453415091d37SBarry Smith   idx    = 0;
453515091d37SBarry Smith   x[0]   = b[0];
453615091d37SBarry Smith   for (i=1; i<n; i++) {
453715091d37SBarry Smith     v     =  aa      + ai[i];
453815091d37SBarry Smith     vi    =  aj      + ai[i];
453915091d37SBarry Smith     nz    =  diag[i] - ai[i];
454015091d37SBarry Smith     idx   +=  1;
4541f1af5d2fSBarry Smith     s1  =  b[idx];
454215091d37SBarry Smith     while (nz--) {
454315091d37SBarry Smith       jdx   = *vi++;
454415091d37SBarry Smith       x1    = x[jdx];
4545f1af5d2fSBarry Smith       s1 -= v[0]*x1;
454615091d37SBarry Smith       v    += 1;
454715091d37SBarry Smith     }
4548f1af5d2fSBarry Smith     x[idx]   = s1;
454915091d37SBarry Smith   }
455015091d37SBarry Smith   /* backward solve the upper triangular */
455115091d37SBarry Smith   for (i=n-1; i>=0; i--){
455215091d37SBarry Smith     v    = aa + diag[i] + 1;
455315091d37SBarry Smith     vi   = aj + diag[i] + 1;
455415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
455515091d37SBarry Smith     idt  = i;
4556f1af5d2fSBarry Smith     s1 = x[idt];
455715091d37SBarry Smith     while (nz--) {
455815091d37SBarry Smith       idx   = *vi++;
455915091d37SBarry Smith       x1    = x[idx];
4560f1af5d2fSBarry Smith       s1 -= v[0]*x1;
456115091d37SBarry Smith       v    += 1;
456215091d37SBarry Smith     }
456315091d37SBarry Smith     v        = aa +  diag[i];
4564f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
456515091d37SBarry Smith   }
45661ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
45671ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4568dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
456915091d37SBarry Smith   PetscFunctionReturn(0);
457015091d37SBarry Smith }
45714e2b4712SSatish Balay 
45724e2b4712SSatish Balay /* ----------------------------------------------------------------*/
/* Prototypes for SeqBAIJ routines whose definitions are not in the visible part of this file. */
/* MatDuplicateNoCreate_SeqBAIJ() is called by MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct() below. */
457316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
/* NOTE(review): judging by the names only, the next two presumably select the numeric
   factorization routine for a factor matrix (old and new data structure) - confirm at the definitions. */
45746bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
4575ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
45766bce7ff8SHong Zhang 
45776bce7ff8SHong Zhang #undef __FUNCT__
45786bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
45796bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
45806bce7ff8SHong Zhang {
45816bce7ff8SHong Zhang   Mat            C=B;
45826bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
45836bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
45846bce7ff8SHong Zhang   PetscErrorCode ierr;
45856bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
45866bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
45876bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
4588b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
4589914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
4590914a18a2SHong Zhang   MatScalar      *v_work;
4591ae3d28f0SHong Zhang   PetscTruth     col_identity,row_identity,both_identity;
45926bce7ff8SHong Zhang 
45936bce7ff8SHong Zhang   PetscFunctionBegin;
45946bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
45956bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
4596ae3d28f0SHong Zhang 
4597fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
4598fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
45996bce7ff8SHong Zhang   ics  = ic;
46006bce7ff8SHong Zhang 
4601914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
4602fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
4603914a18a2SHong Zhang 
46046bce7ff8SHong Zhang   for (i=0; i<n; i++){
46056bce7ff8SHong Zhang     /* zero rtmp */
46066bce7ff8SHong Zhang     /* L part */
46076bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
46086bce7ff8SHong Zhang     bjtmp = bj + bi[i];
4609914a18a2SHong Zhang     for  (j=0; j<nz; j++){
4610914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
4611914a18a2SHong Zhang     }
46126bce7ff8SHong Zhang 
46136bce7ff8SHong Zhang     /* U part */
46141a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
46151a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
46161a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
46171a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
46181a83e813SShri Abhyankar     }
46191a83e813SShri Abhyankar 
46201a83e813SShri Abhyankar     /* load in initial (unfactored row) */
46211a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
46221a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
46231a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
46241a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
46251a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
46261a83e813SShri Abhyankar     }
46271a83e813SShri Abhyankar 
46281a83e813SShri Abhyankar     /* elimination */
46291a83e813SShri Abhyankar     bjtmp = bj + bi[i];
46301a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
46311a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
46321a83e813SShri Abhyankar       row = bjtmp[k];
46331a83e813SShri Abhyankar       pc = rtmp + bs2*row;
46341a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
46351a83e813SShri Abhyankar       if (flg) {
46361a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
46371a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
46381a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
46391a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
46401a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
46411a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
46421a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
46431a83e813SShri Abhyankar         }
46441a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
46451a83e813SShri Abhyankar       }
46461a83e813SShri Abhyankar     }
46471a83e813SShri Abhyankar 
46481a83e813SShri Abhyankar     /* finished row so stick it into b->a */
46491a83e813SShri Abhyankar     /* L part */
46501a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
46511a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
46521a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
46531a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
46541a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
46551a83e813SShri Abhyankar     }
46561a83e813SShri Abhyankar 
46571a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
46581a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
46591a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
46601a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
46611a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
46621a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
46631a83e813SShri Abhyankar 
46641a83e813SShri Abhyankar     /* U part */
46651a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
46661a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
46671a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
46681a83e813SShri Abhyankar     for (j=0; j<nz; j++){
46691a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
46701a83e813SShri Abhyankar     }
46711a83e813SShri Abhyankar   }
46721a83e813SShri Abhyankar 
46731a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
4674fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
46751a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
46761a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
46771a83e813SShri Abhyankar 
4678ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4679ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
4680ae3d28f0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
4681ae3d28f0SHong Zhang   if (both_identity){
4682a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
4683ae3d28f0SHong Zhang   } else {
4684a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
4685ae3d28f0SHong Zhang   }
4686*8499736aSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct;
4687ae3d28f0SHong Zhang 
46881a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
46891a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
46901a83e813SShri Abhyankar   PetscFunctionReturn(0);
46911a83e813SShri Abhyankar }
46921a83e813SShri Abhyankar 
46936bce7ff8SHong Zhang /*
46946bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
469516a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
469616a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
46976bce7ff8SHong Zhang */
4698c0c7eb62SShri Abhyankar 
46996bce7ff8SHong Zhang #undef __FUNCT__
47006bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
47016bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
47026bce7ff8SHong Zhang {
47036bce7ff8SHong Zhang 
47046bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
47056bce7ff8SHong Zhang   PetscErrorCode     ierr;
470616a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
470735aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
470835aa4fcfSShri Abhyankar 
470935aa4fcfSShri Abhyankar   PetscFunctionBegin;
471035aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
471135aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
471235aa4fcfSShri Abhyankar 
471335aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
471435aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
471535aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
471635aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
471735aa4fcfSShri Abhyankar   if (!b->diag){
471835aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
471935aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
472035aa4fcfSShri Abhyankar   }
472135aa4fcfSShri Abhyankar   bdiag = b->diag;
472235aa4fcfSShri Abhyankar 
472335aa4fcfSShri Abhyankar   if (n > 0) {
472435aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
472535aa4fcfSShri Abhyankar   }
472635aa4fcfSShri Abhyankar 
472735aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
472835aa4fcfSShri Abhyankar   bi = b->i;
472935aa4fcfSShri Abhyankar   bj = b->j;
473035aa4fcfSShri Abhyankar 
473135aa4fcfSShri Abhyankar   /* L part */
473235aa4fcfSShri Abhyankar   bi[0] = 0;
473335aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
473435aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
473535aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
473635aa4fcfSShri Abhyankar     aj = a->j + ai[i];
473735aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
473835aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
473935aa4fcfSShri Abhyankar     }
474035aa4fcfSShri Abhyankar   }
474135aa4fcfSShri Abhyankar 
474235aa4fcfSShri Abhyankar   /* U part */
474335aa4fcfSShri Abhyankar   bi_temp = bi[n];
474435aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
474535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
474635aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
474735aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
474835aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
474935aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
475035aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
475135aa4fcfSShri Abhyankar     }
475235aa4fcfSShri Abhyankar     /* diag[i] */
475335aa4fcfSShri Abhyankar     *bj = i; bj++;
475435aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
475535aa4fcfSShri Abhyankar   }
475635aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
475735aa4fcfSShri Abhyankar }
475835aa4fcfSShri Abhyankar 
475935aa4fcfSShri Abhyankar #undef __FUNCT__
476016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
476116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
476216a2bf60SHong Zhang {
476316a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
476416a2bf60SHong Zhang   IS                 isicol;
476516a2bf60SHong Zhang   PetscErrorCode     ierr;
476616a2bf60SHong Zhang   const PetscInt     *r,*ic;
47677fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
476816a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
476916a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
477016a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
47717fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
477216a2bf60SHong Zhang   PetscReal          f;
477316a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
477416a2bf60SHong Zhang   PetscBT            lnkbt;
477516a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
477616a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
477716a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
477816a2bf60SHong Zhang   PetscTruth         missing;
47797fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
478016a2bf60SHong Zhang 
478116a2bf60SHong Zhang   PetscFunctionBegin;
478216a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
478316a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
478416a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
478516a2bf60SHong Zhang 
478616a2bf60SHong Zhang   f             = info->fill;
478716a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
478816a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
478916a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
479016a2bf60SHong Zhang 
479116a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
479216a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
47937fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
479416a2bf60SHong Zhang 
47957fa3a6a0SHong Zhang   if (!levels && both_identity) {
479616a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
479716a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
4798ae3d28f0SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
479935aa4fcfSShri Abhyankar 
480035aa4fcfSShri Abhyankar     fact->factor = MAT_FACTOR_ILU;
480135aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
480235aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
480335aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
480435aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
480535aa4fcfSShri Abhyankar     b->row           = isrow;
480635aa4fcfSShri Abhyankar     b->col           = iscol;
480735aa4fcfSShri Abhyankar     b->icol          = isicol;
480835aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
480935aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
481035aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
481135aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
481235aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
481335aa4fcfSShri Abhyankar   }
481435aa4fcfSShri Abhyankar 
481535aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
481635aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
481735aa4fcfSShri Abhyankar 
481835aa4fcfSShri Abhyankar   /* get new row pointers */
481935aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
482035aa4fcfSShri Abhyankar   bi[0] = 0;
482135aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
482235aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
482335aa4fcfSShri Abhyankar   bdiag[0]  = 0;
482435aa4fcfSShri Abhyankar 
4825fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
482635aa4fcfSShri Abhyankar 
482735aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
482835aa4fcfSShri Abhyankar   nlnk = n + 1;
482935aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
483035aa4fcfSShri Abhyankar 
483135aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
483235aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
483335aa4fcfSShri Abhyankar   current_space = free_space;
483435aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
483535aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
483635aa4fcfSShri Abhyankar 
483735aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
483835aa4fcfSShri Abhyankar     nzi = 0;
483935aa4fcfSShri Abhyankar     /* copy current row into linked list */
484035aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
484135aa4fcfSShri Abhyankar     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
484235aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
484335aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
484435aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
484535aa4fcfSShri Abhyankar     nzi += nlnk;
484635aa4fcfSShri Abhyankar 
484735aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
484835aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
484935aa4fcfSShri Abhyankar       fm = n;
485035aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
485135aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
485235aa4fcfSShri Abhyankar       lnk[fm]    = i;
485335aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
485435aa4fcfSShri Abhyankar       nzi++; dcount++;
485535aa4fcfSShri Abhyankar     }
485635aa4fcfSShri Abhyankar 
485735aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
485835aa4fcfSShri Abhyankar     nzbd = 0;
485935aa4fcfSShri Abhyankar     prow = lnk[n];
486035aa4fcfSShri Abhyankar     while (prow < i) {
486135aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
486235aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
486335aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
486435aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
486535aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
486635aa4fcfSShri Abhyankar       nzi += nlnk;
486735aa4fcfSShri Abhyankar       prow = lnk[prow];
486835aa4fcfSShri Abhyankar       nzbd++;
486935aa4fcfSShri Abhyankar     }
487035aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
487135aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
487235aa4fcfSShri Abhyankar 
487335aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
487435aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
487535aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
487635aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
487735aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
487835aa4fcfSShri Abhyankar       reallocs++;
487935aa4fcfSShri Abhyankar     }
488035aa4fcfSShri Abhyankar 
488135aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
488235aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
488335aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
488435aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
488535aa4fcfSShri Abhyankar 
488635aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
488735aa4fcfSShri Abhyankar     if (*(bj_ptr[i]+bdiag[i]) != i) {
488835aa4fcfSShri Abhyankar       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
488935aa4fcfSShri Abhyankar     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
489035aa4fcfSShri Abhyankar     }
489135aa4fcfSShri Abhyankar 
489235aa4fcfSShri Abhyankar     current_space->array           += nzi;
489335aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
489435aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
489535aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
489635aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
489735aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
489835aa4fcfSShri Abhyankar   }
489935aa4fcfSShri Abhyankar 
490035aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
490135aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
490235aa4fcfSShri Abhyankar 
490335aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
490435aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
490535aa4fcfSShri Abhyankar 
490635aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
490735aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
490835aa4fcfSShri Abhyankar 
490935aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
491035aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
4911fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
491235aa4fcfSShri Abhyankar 
491335aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
491435aa4fcfSShri Abhyankar   {
491535aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
491635aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
491735aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
491835aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
491935aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
492035aa4fcfSShri Abhyankar     if (diagonal_fill) {
492135aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
492235aa4fcfSShri Abhyankar     }
492335aa4fcfSShri Abhyankar   }
492435aa4fcfSShri Abhyankar #endif
492535aa4fcfSShri Abhyankar 
492635aa4fcfSShri Abhyankar   /* put together the new matrix */
492735aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
492835aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
492935aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
493035aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
493135aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
493235aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
493335aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
493435aa4fcfSShri Abhyankar   b->j          = bj;
493535aa4fcfSShri Abhyankar   b->i          = bi;
493635aa4fcfSShri Abhyankar   b->diag       = bdiag;
493735aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
493835aa4fcfSShri Abhyankar   b->ilen       = 0;
493935aa4fcfSShri Abhyankar   b->imax       = 0;
494035aa4fcfSShri Abhyankar   b->row        = isrow;
494135aa4fcfSShri Abhyankar   b->col        = iscol;
494235aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
494335aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
494435aa4fcfSShri Abhyankar   b->icol       = isicol;
494535aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
494635aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
494735aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
494835aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
494935aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
4950ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
4951ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
4952ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
4953ae3d28f0SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
495435aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
495535aa4fcfSShri Abhyankar }
495635aa4fcfSShri Abhyankar 
495735aa4fcfSShri Abhyankar 
49584e2b4712SSatish Balay /*
49594e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
49604e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
49614e2b4712SSatish Balay    Not a good example of code reuse.
49624e2b4712SSatish Balay */
49634a2ae208SSatish Balay #undef __FUNCT__
49644a2ae208SSatish Balay #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
49650481f469SBarry Smith PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
49664e2b4712SSatish Balay {
49674e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
49684e2b4712SSatish Balay   IS             isicol;
49696849ba73SBarry Smith   PetscErrorCode ierr;
49705d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
49715d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
4972a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
4973d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
497441df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
4975329f5518SBarry Smith   PetscReal      f;
4976c0c7eb62SShri Abhyankar   PetscTruth     newdatastruct = PETSC_FALSE;
49774e2b4712SSatish Balay 
49784e2b4712SSatish Balay   PetscFunctionBegin;
497916a2bf60SHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_new",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
498016a2bf60SHong Zhang   if (newdatastruct){
498116a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
498216a2bf60SHong Zhang     PetscFunctionReturn(0);
498316a2bf60SHong Zhang   }
498416a2bf60SHong Zhang 
49856bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
49866bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
49876bce7ff8SHong Zhang 
4988435faa5fSBarry Smith   f             = info->fill;
4989690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
4990690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
49914c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
499216a2bf60SHong Zhang 
4993667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
4994667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
49957d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
4996309c388cSBarry Smith 
499741df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
499816a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
49996bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
50006bce7ff8SHong Zhang 
5001719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5002ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
5003bb3d539aSBarry Smith     b->row       = isrow;
5004bb3d539aSBarry Smith     b->col       = iscol;
5005bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5006bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5007bb3d539aSBarry Smith     b->icol      = isicol;
5008bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5009b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
50106bce7ff8SHong Zhang     PetscFunctionReturn(0);
50116bce7ff8SHong Zhang   }
50126bce7ff8SHong Zhang 
50136bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
50144e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
50154e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
50164e2b4712SSatish Balay 
50174e2b4712SSatish Balay     /* get new row pointers */
5018690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
50194e2b4712SSatish Balay     ainew[0] = 0;
50204e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5021690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5022690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
50234e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5024690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
50254e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5026690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
50274e2b4712SSatish Balay     /* im is level for each filled value */
5028690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
50294e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5030690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
50314e2b4712SSatish Balay     dloc[0]  = 0;
50324e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5033435faa5fSBarry Smith 
5034435faa5fSBarry Smith       /* copy prow into linked list */
50354e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
50363b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
50374e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
50384e2b4712SSatish Balay       fill[n]    = n;
5039435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
50404e2b4712SSatish Balay       while (nz--) {
50414e2b4712SSatish Balay 	fm  = n;
50424e2b4712SSatish Balay 	idx = ic[*xi++];
50434e2b4712SSatish Balay 	do {
50444e2b4712SSatish Balay 	  m  = fm;
50454e2b4712SSatish Balay 	  fm = fill[m];
50464e2b4712SSatish Balay 	} while (fm < idx);
50474e2b4712SSatish Balay 	fill[m]   = idx;
50484e2b4712SSatish Balay 	fill[idx] = fm;
50494e2b4712SSatish Balay 	im[idx]   = 0;
50504e2b4712SSatish Balay       }
5051435faa5fSBarry Smith 
5052435faa5fSBarry Smith       /* make sure diagonal entry is included */
5053435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5054435faa5fSBarry Smith 	fm = n;
5055435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5056435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5057435faa5fSBarry Smith 	fill[fm]   = prow;
5058435faa5fSBarry Smith 	im[prow]   = 0;
5059435faa5fSBarry Smith 	nzf++;
5060335d9088SBarry Smith 	dcount++;
5061435faa5fSBarry Smith       }
5062435faa5fSBarry Smith 
50634e2b4712SSatish Balay       nzi = 0;
50644e2b4712SSatish Balay       row = fill[n];
50654e2b4712SSatish Balay       while (row < prow) {
50664e2b4712SSatish Balay 	incrlev = im[row] + 1;
50674e2b4712SSatish Balay 	nz      = dloc[row];
5068435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
50694e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
50704e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
50714e2b4712SSatish Balay 	fm      = row;
50724e2b4712SSatish Balay 	while (nnz-- > 0) {
50734e2b4712SSatish Balay 	  idx = *xi++;
50744e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
50754e2b4712SSatish Balay 	    flev++;
50764e2b4712SSatish Balay 	    continue;
50774e2b4712SSatish Balay 	  }
50784e2b4712SSatish Balay 	  do {
50794e2b4712SSatish Balay 	    m  = fm;
50804e2b4712SSatish Balay 	    fm = fill[m];
50814e2b4712SSatish Balay 	  } while (fm < idx);
50824e2b4712SSatish Balay 	  if (fm != idx) {
50834e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
50844e2b4712SSatish Balay 	    fill[m]   = idx;
50854e2b4712SSatish Balay 	    fill[idx] = fm;
50864e2b4712SSatish Balay 	    fm        = idx;
50874e2b4712SSatish Balay 	    nzf++;
5088ecf371e4SBarry Smith 	  } else {
50894e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
50904e2b4712SSatish Balay 	  }
50914e2b4712SSatish Balay 	  flev++;
50924e2b4712SSatish Balay 	}
50934e2b4712SSatish Balay 	row = fill[row];
50944e2b4712SSatish Balay 	nzi++;
50954e2b4712SSatish Balay       }
50964e2b4712SSatish Balay       /* copy new filled row into permanent storage */
50974e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
50984e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5099ecf371e4SBarry Smith 
5100ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5101ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5102ecf371e4SBarry Smith 	/* just double the memory each time */
5103690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5104ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
51054e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
51064e2b4712SSatish Balay 	jmax += maxadd;
5107ecf371e4SBarry Smith 
5108ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
51095d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
51105d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5111606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
51125d0c19d7SBarry Smith 	ajnew = xitmp;
51135d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
51145d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5115606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
51165d0c19d7SBarry Smith 	ajfill = xitmp;
5117eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
51184e2b4712SSatish Balay       }
51195d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
51204e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
51214e2b4712SSatish Balay       dloc[prow]  = nzi;
51224e2b4712SSatish Balay       fm          = fill[n];
51234e2b4712SSatish Balay       while (nzf--) {
51245d0c19d7SBarry Smith 	*xitmp++ = fm;
51254e2b4712SSatish Balay 	*flev++ = im[fm];
51264e2b4712SSatish Balay 	fm      = fill[fm];
51274e2b4712SSatish Balay       }
5128435faa5fSBarry Smith       /* make sure row has diagonal entry */
5129435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
513077431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
51312401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5132435faa5fSBarry Smith       }
51334e2b4712SSatish Balay     }
5134606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
51354e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
51364e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5137606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5138606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
51394e2b4712SSatish Balay 
51406cf91177SBarry Smith #if defined(PETSC_USE_INFO)
51414e2b4712SSatish Balay     {
5142329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5143ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5144ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5145ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5146ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5147335d9088SBarry Smith       if (diagonal_fill) {
5148ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5149335d9088SBarry Smith       }
51504e2b4712SSatish Balay     }
515163ba0a88SBarry Smith #endif
51524e2b4712SSatish Balay 
51534e2b4712SSatish Balay     /* put together the new matrix */
5154719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5155719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5156ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
5157e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5158e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
51597c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5160a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
51614e2b4712SSatish Balay     b->j          = ajnew;
51624e2b4712SSatish Balay     b->i          = ainew;
51634e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
51644e2b4712SSatish Balay     b->diag       = dloc;
51657f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
51664e2b4712SSatish Balay     b->ilen       = 0;
51674e2b4712SSatish Balay     b->imax       = 0;
51684e2b4712SSatish Balay     b->row        = isrow;
51694e2b4712SSatish Balay     b->col        = iscol;
5170bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5171c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5172c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5173e51c0b9cSSatish Balay     b->icol       = isicol;
517487828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
51754e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
51764e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5177719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
51784e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
51794e2b4712SSatish Balay 
5180ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
5181ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
5182ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
51836bce7ff8SHong Zhang 
518441df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
51858661488fSKris Buschelman   PetscFunctionReturn(0);
51868661488fSKris Buschelman }
51878661488fSKris Buschelman 
5188732ee342SKris Buschelman #undef __FUNCT__
51897e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5190dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
51917e7071cdSKris Buschelman {
519212272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
519312272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
51945a9542e3SKris Buschelman   PetscFunctionBegin;
51957cf1b8d3SKris Buschelman   /* Undo Column scaling */
51967cf1b8d3SKris Buschelman /*    while (nz--) { */
51977cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
51987cf1b8d3SKris Buschelman /*    } */
5199c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5200c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
52017cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
52027cf1b8d3SKris Buschelman }
52037cf1b8d3SKris Buschelman 
52047cf1b8d3SKris Buschelman #undef __FUNCT__
52057cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5206dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
52077cf1b8d3SKris Buschelman {
52087cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5209b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
52102aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
52115a9542e3SKris Buschelman   PetscFunctionBegin;
52120b9da03eSKris Buschelman   /* Is this really necessary? */
521320235379SKris Buschelman   while (nz--) {
52140b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
52157e7071cdSKris Buschelman   }
5216c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
52177e7071cdSKris Buschelman   PetscFunctionReturn(0);
52187e7071cdSKris Buschelman }
52197e7071cdSKris Buschelman 
5220732ee342SKris Buschelman 
5221