xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 06e38f1d1dda9d815f70edc0143bb77eead08adb)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
3a4005a5dSBarry Smith 
44e2b4712SSatish Balay /*
54e2b4712SSatish Balay     Factorization code for BAIJ format.
64e2b4712SSatish Balay */
74e2b4712SSatish Balay 
87c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
9c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
1016a2bf60SHong Zhang #include "petscbt.h"
1116a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
124e2b4712SSatish Balay 
134a2ae208SSatish Balay #undef __FUNCT__
14*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
15*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
16f1af5d2fSBarry Smith {
17f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
18dfbe8321SBarry Smith   PetscErrorCode ierr;
19690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
20690b6cddSBarry Smith   PetscInt       *diag = a->diag;
21f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
2287828ca2SBarry Smith   PetscScalar    s1,*x,*b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
261ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
521ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
59*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
60*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode ierr;
64690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
65690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
66f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
6787828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
6887828ca2SBarry Smith   PetscScalar    *x,*b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
1111ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1186929473cSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct"
1196929473cSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
1206929473cSShri Abhyankar {
1216929473cSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1226929473cSShri Abhyankar   PetscErrorCode ierr;
1236929473cSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1246929473cSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
1256929473cSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
1266929473cSShri Abhyankar   MatScalar      *aa=a->a,*v;
1276929473cSShri Abhyankar   PetscScalar    s1,s2,x1,x2;
1286929473cSShri Abhyankar   PetscScalar    *x,*b;
1296929473cSShri Abhyankar 
1306929473cSShri Abhyankar   PetscFunctionBegin;
1316929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1326929473cSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1336929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1346929473cSShri Abhyankar 
1356929473cSShri Abhyankar   /* forward solve the U^T */
1366929473cSShri Abhyankar   idx = 0;
1376929473cSShri Abhyankar   for (i=0; i<n; i++) {
1386929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1396929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1406929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1416929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1426929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1436929473cSShri Abhyankar     v -= bs2;
1446929473cSShri Abhyankar 
1456929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1466929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1476929473cSShri Abhyankar     for(j=0;j>-nz;j--){
1486929473cSShri Abhyankar       oidx = bs*vi[j];
1496929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1506929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1516929473cSShri Abhyankar       v  -= bs2;
1526929473cSShri Abhyankar     }
1536929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
1546929473cSShri Abhyankar     idx += bs;
1556929473cSShri Abhyankar   }
1566929473cSShri Abhyankar   /* backward solve the L^T */
1576929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
1586929473cSShri Abhyankar     v    = aa + bs2*ai[i];
1596929473cSShri Abhyankar     vi   = aj + ai[i];
1606929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
1616929473cSShri Abhyankar     idt  = bs*i;
1626929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
1636929473cSShri Abhyankar     for(j=0;j<nz;j++){
1646929473cSShri Abhyankar       idx   = bs*vi[j];
1656929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
1666929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
1676929473cSShri Abhyankar       v += bs2;
1686929473cSShri Abhyankar     }
1696929473cSShri Abhyankar   }
1706929473cSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
1716929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1726929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1736929473cSShri Abhyankar   PetscFunctionReturn(0);
1746929473cSShri Abhyankar }
1756929473cSShri Abhyankar 
1766929473cSShri Abhyankar #undef __FUNCT__
177*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
178*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179f1af5d2fSBarry Smith {
180f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
181dfbe8321SBarry Smith   PetscErrorCode ierr;
182690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
183690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
184f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
18587828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
18687828ca2SBarry Smith   PetscScalar    *x,*b;
187f1af5d2fSBarry Smith 
188f1af5d2fSBarry Smith   PetscFunctionBegin;
189ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
1901ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith   /* forward solve the U^T */
194f1af5d2fSBarry Smith   idx = 0;
195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
196f1af5d2fSBarry Smith 
197f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
199ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203f1af5d2fSBarry Smith     v += 9;
204f1af5d2fSBarry Smith 
205f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
206f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
207f1af5d2fSBarry Smith     while (nz--) {
208f1af5d2fSBarry Smith       oidx = 3*(*vi++);
209f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212f1af5d2fSBarry Smith       v  += 9;
213f1af5d2fSBarry Smith     }
214f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215f1af5d2fSBarry Smith     idx += 3;
216f1af5d2fSBarry Smith   }
217f1af5d2fSBarry Smith   /* backward solve the L^T */
218f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
219f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
220f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
221f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
222f1af5d2fSBarry Smith     idt  = 3*i;
223f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224f1af5d2fSBarry Smith     while (nz--) {
225f1af5d2fSBarry Smith       idx   = 3*(*vi--);
226f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229f1af5d2fSBarry Smith       v -= 9;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
2321ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2398499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct"
2408499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
2418499736aSShri Abhyankar {
2428499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
2438499736aSShri Abhyankar   PetscErrorCode ierr;
2448499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2458499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
2468499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
2478499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
2488499736aSShri Abhyankar   PetscScalar    s1,s2,s3,x1,x2,x3;
2498499736aSShri Abhyankar   PetscScalar    *x,*b;
2508499736aSShri Abhyankar 
2518499736aSShri Abhyankar   PetscFunctionBegin;
2528499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
2538499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
2548499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2558499736aSShri Abhyankar 
2568499736aSShri Abhyankar   /* forward solve the U^T */
2578499736aSShri Abhyankar   idx = 0;
2588499736aSShri Abhyankar   for (i=0; i<n; i++) {
2598499736aSShri Abhyankar     v     = aa + bs2*diag[i];
2608499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
2618499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
2628499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
2638499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
2648499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
2658499736aSShri Abhyankar     v -= bs2;
2668499736aSShri Abhyankar 
2678499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
2688499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
2698499736aSShri Abhyankar     for(j=0;j>-nz;j--){
2708499736aSShri Abhyankar       oidx = bs*vi[j];
2718499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
2728499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
2738499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
2748499736aSShri Abhyankar       v  -= bs2;
2758499736aSShri Abhyankar     }
2768499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
2778499736aSShri Abhyankar     idx += bs;
2788499736aSShri Abhyankar   }
2798499736aSShri Abhyankar   /* backward solve the L^T */
2808499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
2818499736aSShri Abhyankar     v    = aa + bs2*ai[i];
2828499736aSShri Abhyankar     vi   = aj + ai[i];
2838499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
2848499736aSShri Abhyankar     idt  = bs*i;
2858499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
2868499736aSShri Abhyankar     for(j=0;j<nz;j++){
2878499736aSShri Abhyankar       idx   = bs*vi[j];
2888499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
2898499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
2908499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
2918499736aSShri Abhyankar       v += bs2;
2928499736aSShri Abhyankar     }
2938499736aSShri Abhyankar   }
2948499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
2958499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2968499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2978499736aSShri Abhyankar   PetscFunctionReturn(0);
2988499736aSShri Abhyankar }
2998499736aSShri Abhyankar 
3008499736aSShri Abhyankar #undef __FUNCT__
301*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
302*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303f1af5d2fSBarry Smith {
304f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
305dfbe8321SBarry Smith   PetscErrorCode ierr;
306690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
307690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
308f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
30987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
31087828ca2SBarry Smith   PetscScalar    *x,*b;
311f1af5d2fSBarry Smith 
312f1af5d2fSBarry Smith   PetscFunctionBegin;
313ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3141ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316f1af5d2fSBarry Smith 
317f1af5d2fSBarry Smith   /* forward solve the U^T */
318f1af5d2fSBarry Smith   idx = 0;
319f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
322f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
323ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328f1af5d2fSBarry Smith     v += 16;
329f1af5d2fSBarry Smith 
330f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
331f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
332f1af5d2fSBarry Smith     while (nz--) {
333f1af5d2fSBarry Smith       oidx = 4*(*vi++);
334f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338f1af5d2fSBarry Smith       v  += 16;
339f1af5d2fSBarry Smith     }
340f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341f1af5d2fSBarry Smith     idx += 4;
342f1af5d2fSBarry Smith   }
343f1af5d2fSBarry Smith   /* backward solve the L^T */
344f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
345f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
346f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
347f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
348f1af5d2fSBarry Smith     idt  = 4*i;
349f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350f1af5d2fSBarry Smith     while (nz--) {
351f1af5d2fSBarry Smith       idx   = 4*(*vi--);
352f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356f1af5d2fSBarry Smith       v -= 16;
357f1af5d2fSBarry Smith     }
358f1af5d2fSBarry Smith   }
3591ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
3601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362f1af5d2fSBarry Smith   PetscFunctionReturn(0);
363f1af5d2fSBarry Smith }
364f1af5d2fSBarry Smith 
3654a2ae208SSatish Balay #undef __FUNCT__
3668499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct"
3678499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3688499736aSShri Abhyankar {
3698499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
3708499736aSShri Abhyankar   PetscErrorCode ierr;
3718499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
3728499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
3738499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
3748499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
3758499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
3768499736aSShri Abhyankar   PetscScalar    *x,*b;
3778499736aSShri Abhyankar 
3788499736aSShri Abhyankar   PetscFunctionBegin;
3798499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
3808499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
3818499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3828499736aSShri Abhyankar 
3838499736aSShri Abhyankar   /* forward solve the U^T */
3848499736aSShri Abhyankar   idx = 0;
3858499736aSShri Abhyankar   for (i=0; i<n; i++) {
3868499736aSShri Abhyankar     v     = aa + bs2*diag[i];
3878499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
3888499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
3898499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
3908499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
3918499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
3928499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
3938499736aSShri Abhyankar     v -= bs2;
3948499736aSShri Abhyankar 
3958499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3968499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3978499736aSShri Abhyankar     for(j=0;j>-nz;j--){
3988499736aSShri Abhyankar       oidx = bs*vi[j];
3998499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4008499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4018499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4028499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4038499736aSShri Abhyankar       v  -= bs2;
4048499736aSShri Abhyankar     }
4058499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4068499736aSShri Abhyankar     idx += bs;
4078499736aSShri Abhyankar   }
4088499736aSShri Abhyankar   /* backward solve the L^T */
4098499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
4108499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4118499736aSShri Abhyankar     vi   = aj + ai[i];
4128499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4138499736aSShri Abhyankar     idt  = bs*i;
4148499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4158499736aSShri Abhyankar     for(j=0;j<nz;j++){
4168499736aSShri Abhyankar       idx   = bs*vi[j];
4178499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4188499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4198499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4208499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4218499736aSShri Abhyankar       v += bs2;
4228499736aSShri Abhyankar     }
4238499736aSShri Abhyankar   }
4248499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4258499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4268499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4278499736aSShri Abhyankar   PetscFunctionReturn(0);
4288499736aSShri Abhyankar }
4298499736aSShri Abhyankar 
4308499736aSShri Abhyankar #undef __FUNCT__
431*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
432*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433f1af5d2fSBarry Smith {
434f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
435dfbe8321SBarry Smith   PetscErrorCode ierr;
436690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
437690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
438f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
43987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
44087828ca2SBarry Smith   PetscScalar    *x,*b;
441f1af5d2fSBarry Smith 
442f1af5d2fSBarry Smith   PetscFunctionBegin;
443ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
4441ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
4451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446f1af5d2fSBarry Smith 
447f1af5d2fSBarry Smith   /* forward solve the U^T */
448f1af5d2fSBarry Smith   idx = 0;
449f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
450f1af5d2fSBarry Smith 
451f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
452f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
453ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459f1af5d2fSBarry Smith     v += 25;
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
462f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
463f1af5d2fSBarry Smith     while (nz--) {
464f1af5d2fSBarry Smith       oidx = 5*(*vi++);
465f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470f1af5d2fSBarry Smith       v  += 25;
471f1af5d2fSBarry Smith     }
472f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473f1af5d2fSBarry Smith     idx += 5;
474f1af5d2fSBarry Smith   }
475f1af5d2fSBarry Smith   /* backward solve the L^T */
476f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
477f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
478f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
479f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
480f1af5d2fSBarry Smith     idt  = 5*i;
481f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482f1af5d2fSBarry Smith     while (nz--) {
483f1af5d2fSBarry Smith       idx   = 5*(*vi--);
484f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489f1af5d2fSBarry Smith       v -= 25;
490f1af5d2fSBarry Smith     }
491f1af5d2fSBarry Smith   }
4921ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
4931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495f1af5d2fSBarry Smith   PetscFunctionReturn(0);
496f1af5d2fSBarry Smith }
497f1af5d2fSBarry Smith 
4984a2ae208SSatish Balay #undef __FUNCT__
4998499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct"
5008499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
5018499736aSShri Abhyankar {
5028499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5038499736aSShri Abhyankar   PetscErrorCode ierr;
5048499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5058499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
5068499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
5078499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
5088499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
5098499736aSShri Abhyankar   PetscScalar    *x,*b;
5108499736aSShri Abhyankar 
5118499736aSShri Abhyankar   PetscFunctionBegin;
5128499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5138499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5148499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5158499736aSShri Abhyankar 
5168499736aSShri Abhyankar   /* forward solve the U^T */
5178499736aSShri Abhyankar   idx = 0;
5188499736aSShri Abhyankar   for (i=0; i<n; i++) {
5198499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5208499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5218499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5228499736aSShri Abhyankar     x5 = x[4+idx];
5238499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5248499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5258499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5268499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5278499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5288499736aSShri Abhyankar     v -= bs2;
5298499736aSShri Abhyankar 
5308499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5318499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5328499736aSShri Abhyankar     for(j=0;j>-nz;j--){
5338499736aSShri Abhyankar       oidx = bs*vi[j];
5348499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5358499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5368499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5378499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5388499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5398499736aSShri Abhyankar       v  -= bs2;
5408499736aSShri Abhyankar     }
5418499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5428499736aSShri Abhyankar     idx += bs;
5438499736aSShri Abhyankar   }
5448499736aSShri Abhyankar   /* backward solve the L^T */
5458499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
5468499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5478499736aSShri Abhyankar     vi   = aj + ai[i];
5488499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
5498499736aSShri Abhyankar     idt  = bs*i;
5508499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
5518499736aSShri Abhyankar     for(j=0;j<nz;j++){
5528499736aSShri Abhyankar       idx   = bs*vi[j];
5538499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5548499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5558499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5568499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5578499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5588499736aSShri Abhyankar       v += bs2;
5598499736aSShri Abhyankar     }
5608499736aSShri Abhyankar   }
5618499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
5628499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5638499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5648499736aSShri Abhyankar   PetscFunctionReturn(0);
5658499736aSShri Abhyankar }
5668499736aSShri Abhyankar 
5678499736aSShri Abhyankar #undef __FUNCT__
568*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
569*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570f1af5d2fSBarry Smith {
571f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
572dfbe8321SBarry Smith   PetscErrorCode ierr;
573690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
574690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
575f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
57687828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
57787828ca2SBarry Smith   PetscScalar    *x,*b;
578f1af5d2fSBarry Smith 
579f1af5d2fSBarry Smith   PetscFunctionBegin;
580ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
5811ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
5821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583f1af5d2fSBarry Smith 
584f1af5d2fSBarry Smith   /* forward solve the U^T */
585f1af5d2fSBarry Smith   idx = 0;
586f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
587f1af5d2fSBarry Smith 
588f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
589f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
590ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591ef66eb69SBarry Smith     x6    = x[5+idx];
592f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598f1af5d2fSBarry Smith     v += 36;
599f1af5d2fSBarry Smith 
600f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
601f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
602f1af5d2fSBarry Smith     while (nz--) {
603f1af5d2fSBarry Smith       oidx = 6*(*vi++);
604f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610f1af5d2fSBarry Smith       v  += 36;
611f1af5d2fSBarry Smith     }
612f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613f1af5d2fSBarry Smith     x[5+idx] = s6;
614f1af5d2fSBarry Smith     idx += 6;
615f1af5d2fSBarry Smith   }
616f1af5d2fSBarry Smith   /* backward solve the L^T */
617f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
618f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
619f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
620f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
621f1af5d2fSBarry Smith     idt  = 6*i;
622f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623f1af5d2fSBarry Smith     s6 = x[5+idt];
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       idx   = 6*(*vi--);
626f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v -= 36;
633f1af5d2fSBarry Smith     }
634f1af5d2fSBarry Smith   }
6351ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
6361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638f1af5d2fSBarry Smith   PetscFunctionReturn(0);
639f1af5d2fSBarry Smith }
640f1af5d2fSBarry Smith 
6414a2ae208SSatish Balay #undef __FUNCT__
6428499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct"
6438499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
6448499736aSShri Abhyankar {
6458499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
6468499736aSShri Abhyankar   PetscErrorCode ierr;
6478499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6488499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
6498499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
6508499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
6518499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
6528499736aSShri Abhyankar   PetscScalar    *x,*b;
6538499736aSShri Abhyankar 
6548499736aSShri Abhyankar   PetscFunctionBegin;
6558499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
6568499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
6578499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
6588499736aSShri Abhyankar 
6598499736aSShri Abhyankar   /* forward solve the U^T */
6608499736aSShri Abhyankar   idx = 0;
6618499736aSShri Abhyankar   for (i=0; i<n; i++) {
6628499736aSShri Abhyankar     v     = aa + bs2*diag[i];
6638499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
6648499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
6658499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
6668499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
6678499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
6688499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
6698499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
6708499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
6718499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
6728499736aSShri Abhyankar     v -= bs2;
6738499736aSShri Abhyankar 
6748499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
6758499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
6768499736aSShri Abhyankar     for(j=0;j>-nz;j--){
6778499736aSShri Abhyankar       oidx = bs*vi[j];
6788499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
6798499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
6808499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
6818499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
6828499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
6838499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
6848499736aSShri Abhyankar       v  -= bs2;
6858499736aSShri Abhyankar     }
6868499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
6878499736aSShri Abhyankar     x[5+idx] = s6;
6888499736aSShri Abhyankar     idx += bs;
6898499736aSShri Abhyankar   }
6908499736aSShri Abhyankar   /* backward solve the L^T */
6918499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
6928499736aSShri Abhyankar     v    = aa + bs2*ai[i];
6938499736aSShri Abhyankar     vi   = aj + ai[i];
6948499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
6958499736aSShri Abhyankar     idt  = bs*i;
6968499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
6978499736aSShri Abhyankar     s6   = x[5+idt];
6988499736aSShri Abhyankar     for(j=0;j<nz;j++){
6998499736aSShri Abhyankar       idx   = bs*vi[j];
7008499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7018499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7028499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7038499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7048499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7058499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7068499736aSShri Abhyankar       v += bs2;
7078499736aSShri Abhyankar     }
7088499736aSShri Abhyankar   }
7098499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7108499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7118499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7128499736aSShri Abhyankar   PetscFunctionReturn(0);
7138499736aSShri Abhyankar }
7148499736aSShri Abhyankar 
7158499736aSShri Abhyankar #undef __FUNCT__
716*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
717*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718f1af5d2fSBarry Smith {
719f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
720dfbe8321SBarry Smith   PetscErrorCode ierr;
721690b6cddSBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
722690b6cddSBarry Smith   PetscInt       *diag = a->diag,oidx;
723f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
72487828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
72587828ca2SBarry Smith   PetscScalar    *x,*b;
726f1af5d2fSBarry Smith 
727f1af5d2fSBarry Smith   PetscFunctionBegin;
728ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
7291ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
7301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731f1af5d2fSBarry Smith 
732f1af5d2fSBarry Smith   /* forward solve the U^T */
733f1af5d2fSBarry Smith   idx = 0;
734f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
735f1af5d2fSBarry Smith 
736f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
737f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
738ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747f1af5d2fSBarry Smith     v += 49;
748f1af5d2fSBarry Smith 
749f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
750f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       oidx = 7*(*vi++);
753f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760f1af5d2fSBarry Smith       v  += 49;
761f1af5d2fSBarry Smith     }
762f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
764f1af5d2fSBarry Smith     idx += 7;
765f1af5d2fSBarry Smith   }
766f1af5d2fSBarry Smith   /* backward solve the L^T */
767f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
768f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
769f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
770f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
771f1af5d2fSBarry Smith     idt  = 7*i;
772f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
774f1af5d2fSBarry Smith     while (nz--) {
775f1af5d2fSBarry Smith       idx   = 7*(*vi--);
776f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783f1af5d2fSBarry Smith       v -= 49;
784f1af5d2fSBarry Smith     }
785f1af5d2fSBarry Smith   }
7861ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
7871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789f1af5d2fSBarry Smith   PetscFunctionReturn(0);
790f1af5d2fSBarry Smith }
7918499736aSShri Abhyankar #undef __FUNCT__
7928499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct"
7938499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
7948499736aSShri Abhyankar {
7958499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
7968499736aSShri Abhyankar   PetscErrorCode ierr;
7978499736aSShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
7988499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
7998499736aSShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
8008499736aSShri Abhyankar   MatScalar      *aa=a->a,*v;
8018499736aSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
8028499736aSShri Abhyankar   PetscScalar    *x,*b;
8038499736aSShri Abhyankar 
8048499736aSShri Abhyankar   PetscFunctionBegin;
8058499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
8068499736aSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8078499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8088499736aSShri Abhyankar 
8098499736aSShri Abhyankar   /* forward solve the U^T */
8108499736aSShri Abhyankar   idx = 0;
8118499736aSShri Abhyankar   for (i=0; i<n; i++) {
8128499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8138499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8148499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8158499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8168499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8178499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8188499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8198499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8208499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8218499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8228499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8238499736aSShri Abhyankar     v -= bs2;
8248499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8258499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8268499736aSShri Abhyankar     for(j=0;j>-nz;j--){
8278499736aSShri Abhyankar       oidx = bs*vi[j];
8288499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8298499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8308499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8318499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8328499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8338499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8348499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8358499736aSShri Abhyankar       v  -= bs2;
8368499736aSShri Abhyankar     }
8378499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8388499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8398499736aSShri Abhyankar     idx += bs;
8408499736aSShri Abhyankar   }
8418499736aSShri Abhyankar   /* backward solve the L^T */
8428499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
8438499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8448499736aSShri Abhyankar     vi   = aj + ai[i];
8458499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8468499736aSShri Abhyankar     idt  = bs*i;
8478499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8488499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
8498499736aSShri Abhyankar     for(j=0;j<nz;j++){
8508499736aSShri Abhyankar       idx   = bs*vi[j];
8518499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8528499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8538499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8548499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8558499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8568499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8578499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8588499736aSShri Abhyankar       v += bs2;
8598499736aSShri Abhyankar     }
8608499736aSShri Abhyankar   }
8618499736aSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
8628499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
8638499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
8648499736aSShri Abhyankar   PetscFunctionReturn(0);
8658499736aSShri Abhyankar }
866f1af5d2fSBarry Smith 
867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
8684a2ae208SSatish Balay #undef __FUNCT__
869*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
870*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871f1af5d2fSBarry Smith {
872f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
873f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
8746849ba73SBarry Smith   PetscErrorCode ierr;
8755d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
8765d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
877690b6cddSBarry Smith   PetscInt       *diag = a->diag;
878f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
87987828ca2SBarry Smith   PetscScalar    s1,*x,*b,*t;
880f1af5d2fSBarry Smith 
881f1af5d2fSBarry Smith   PetscFunctionBegin;
8821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
8831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
884f1af5d2fSBarry Smith   t  = a->solve_work;
885f1af5d2fSBarry Smith 
886f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
887f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
888f1af5d2fSBarry Smith 
889f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
890f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
891f1af5d2fSBarry Smith     t[i] = b[c[i]];
892f1af5d2fSBarry Smith   }
893f1af5d2fSBarry Smith 
894f1af5d2fSBarry Smith   /* forward solve the U^T */
895f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
896f1af5d2fSBarry Smith 
897f1af5d2fSBarry Smith     v     = aa + diag[i];
898f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
899f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
900f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
901f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
902f1af5d2fSBarry Smith     while (nz--) {
903f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
904f1af5d2fSBarry Smith     }
905f1af5d2fSBarry Smith     t[i]   = s1;
906f1af5d2fSBarry Smith   }
907f1af5d2fSBarry Smith   /* backward solve the L^T */
908f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
909f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
910f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
911f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
912f1af5d2fSBarry Smith     s1   = t[i];
913f1af5d2fSBarry Smith     while (nz--) {
914f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
915f1af5d2fSBarry Smith     }
916f1af5d2fSBarry Smith   }
917f1af5d2fSBarry Smith 
918f1af5d2fSBarry Smith   /* copy t into x according to permutation */
919f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
920f1af5d2fSBarry Smith     x[r[i]]   = t[i];
921f1af5d2fSBarry Smith   }
922f1af5d2fSBarry Smith 
923f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
924f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
9251ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
9261ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
927dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
928f1af5d2fSBarry Smith   PetscFunctionReturn(0);
929f1af5d2fSBarry Smith }
930f1af5d2fSBarry Smith 
9314a2ae208SSatish Balay #undef __FUNCT__
932*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
933*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
934f1af5d2fSBarry Smith {
935f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
936f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
9376849ba73SBarry Smith   PetscErrorCode ierr;
9385d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
9395d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
940690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
941f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
94287828ca2SBarry Smith   PetscScalar    s1,s2,x1,x2;
94387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
944f1af5d2fSBarry Smith 
945f1af5d2fSBarry Smith   PetscFunctionBegin;
9461ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
9471ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
948f1af5d2fSBarry Smith   t  = a->solve_work;
949f1af5d2fSBarry Smith 
950f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
951f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
952f1af5d2fSBarry Smith 
953f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
954f1af5d2fSBarry Smith   ii = 0;
955f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
956f1af5d2fSBarry Smith     ic      = 2*c[i];
957f1af5d2fSBarry Smith     t[ii]   = b[ic];
958f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
959f1af5d2fSBarry Smith     ii += 2;
960f1af5d2fSBarry Smith   }
961f1af5d2fSBarry Smith 
962f1af5d2fSBarry Smith   /* forward solve the U^T */
963f1af5d2fSBarry Smith   idx = 0;
964f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
965f1af5d2fSBarry Smith 
966f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
967f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
968f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
969f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
970f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
971f1af5d2fSBarry Smith     v += 4;
972f1af5d2fSBarry Smith 
973f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
974f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
975f1af5d2fSBarry Smith     while (nz--) {
976f1af5d2fSBarry Smith       oidx = 2*(*vi++);
977f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
978f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
979f1af5d2fSBarry Smith       v  += 4;
980f1af5d2fSBarry Smith     }
981f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
982f1af5d2fSBarry Smith     idx += 2;
983f1af5d2fSBarry Smith   }
984f1af5d2fSBarry Smith   /* backward solve the L^T */
985f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
986f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
987f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
988f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
989f1af5d2fSBarry Smith     idt  = 2*i;
990f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
991f1af5d2fSBarry Smith     while (nz--) {
992f1af5d2fSBarry Smith       idx   = 2*(*vi--);
993f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
994f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
995f1af5d2fSBarry Smith       v -= 4;
996f1af5d2fSBarry Smith     }
997f1af5d2fSBarry Smith   }
998f1af5d2fSBarry Smith 
999f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1000f1af5d2fSBarry Smith   ii = 0;
1001f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1002f1af5d2fSBarry Smith     ir      = 2*r[i];
1003f1af5d2fSBarry Smith     x[ir]   = t[ii];
1004f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1005f1af5d2fSBarry Smith     ii += 2;
1006f1af5d2fSBarry Smith   }
1007f1af5d2fSBarry Smith 
1008f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1009f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
10101ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
10111ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1012dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1013f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1014f1af5d2fSBarry Smith }
1015f1af5d2fSBarry Smith 
10164a2ae208SSatish Balay #undef __FUNCT__
101732121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_newdatastruct"
101832121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
101932121132SShri Abhyankar {
102032121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
102132121132SShri Abhyankar   PetscErrorCode ierr;
102232121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
102332121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
102432121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
102532121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
102632121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
102732121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
102832121132SShri Abhyankar   PetscScalar    s1,s2,x1,x2;
102932121132SShri Abhyankar   PetscScalar    *x,*b,*t;
103032121132SShri Abhyankar 
103132121132SShri Abhyankar   PetscFunctionBegin;
103232121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
103332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
103432121132SShri Abhyankar   t = a->solve_work;
103532121132SShri Abhyankar 
103632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
103732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
103832121132SShri Abhyankar 
103932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
104032121132SShri Abhyankar   for(i=0;i<n;i++){
104132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
104232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
104332121132SShri Abhyankar   }
104432121132SShri Abhyankar 
104532121132SShri Abhyankar   /* forward solve the U^T */
104632121132SShri Abhyankar   idx = 0;
104732121132SShri Abhyankar   for (i=0; i<n; i++) {
104832121132SShri Abhyankar     v     = aa + bs2*diag[i];
104932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
105032121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
105132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
105232121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
105332121132SShri Abhyankar     v -= bs2;
105432121132SShri Abhyankar 
105532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
105632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
105732121132SShri Abhyankar     for(j=0;j>-nz;j--){
105832121132SShri Abhyankar       oidx = bs*vi[j];
105932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
106032121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
106132121132SShri Abhyankar       v  -= bs2;
106232121132SShri Abhyankar     }
106332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
106432121132SShri Abhyankar     idx += bs;
106532121132SShri Abhyankar   }
106632121132SShri Abhyankar   /* backward solve the L^T */
106732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
106832121132SShri Abhyankar     v    = aa + bs2*ai[i];
106932121132SShri Abhyankar     vi   = aj + ai[i];
107032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
107132121132SShri Abhyankar     idt  = bs*i;
107232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
107332121132SShri Abhyankar     for(j=0;j<nz;j++){
107432121132SShri Abhyankar       idx   = bs*vi[j];
107532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
107632121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
107732121132SShri Abhyankar       v += bs2;
107832121132SShri Abhyankar     }
107932121132SShri Abhyankar   }
108032121132SShri Abhyankar 
108132121132SShri Abhyankar   /* copy t into x according to permutation */
108232121132SShri Abhyankar   for(i=0;i<n;i++){
108332121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
108432121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
108532121132SShri Abhyankar   }
108632121132SShri Abhyankar 
108732121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
108832121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
108932121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
109032121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109132121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
109232121132SShri Abhyankar   PetscFunctionReturn(0);
109332121132SShri Abhyankar }
109432121132SShri Abhyankar 
109532121132SShri Abhyankar #undef __FUNCT__
1096*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
1097*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1098f1af5d2fSBarry Smith {
1099f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1100f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
11016849ba73SBarry Smith   PetscErrorCode ierr;
11025d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
11035d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1104690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1105f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
110687828ca2SBarry Smith   PetscScalar    s1,s2,s3,x1,x2,x3;
110787828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1108f1af5d2fSBarry Smith 
1109f1af5d2fSBarry Smith   PetscFunctionBegin;
11101ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
11111ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1112f1af5d2fSBarry Smith   t  = a->solve_work;
1113f1af5d2fSBarry Smith 
1114f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1115f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1116f1af5d2fSBarry Smith 
1117f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1118f1af5d2fSBarry Smith   ii = 0;
1119f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1120f1af5d2fSBarry Smith     ic      = 3*c[i];
1121f1af5d2fSBarry Smith     t[ii]   = b[ic];
1122f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1123f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1124f1af5d2fSBarry Smith     ii += 3;
1125f1af5d2fSBarry Smith   }
1126f1af5d2fSBarry Smith 
1127f1af5d2fSBarry Smith   /* forward solve the U^T */
1128f1af5d2fSBarry Smith   idx = 0;
1129f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1130f1af5d2fSBarry Smith 
1131f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1132f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1133f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1134f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1135f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1136f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1137f1af5d2fSBarry Smith     v += 9;
1138f1af5d2fSBarry Smith 
1139f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1140f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1141f1af5d2fSBarry Smith     while (nz--) {
1142f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1143f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1144f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1145f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1146f1af5d2fSBarry Smith       v  += 9;
1147f1af5d2fSBarry Smith     }
1148f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1149f1af5d2fSBarry Smith     idx += 3;
1150f1af5d2fSBarry Smith   }
1151f1af5d2fSBarry Smith   /* backward solve the L^T */
1152f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1153f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1154f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1155f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1156f1af5d2fSBarry Smith     idt  = 3*i;
1157f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1158f1af5d2fSBarry Smith     while (nz--) {
1159f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1160f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1161f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1162f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1163f1af5d2fSBarry Smith       v -= 9;
1164f1af5d2fSBarry Smith     }
1165f1af5d2fSBarry Smith   }
1166f1af5d2fSBarry Smith 
1167f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1168f1af5d2fSBarry Smith   ii = 0;
1169f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1170f1af5d2fSBarry Smith     ir      = 3*r[i];
1171f1af5d2fSBarry Smith     x[ir]   = t[ii];
1172f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1173f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1174f1af5d2fSBarry Smith     ii += 3;
1175f1af5d2fSBarry Smith   }
1176f1af5d2fSBarry Smith 
1177f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1178f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
11791ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
11801ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1181dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1182f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1183f1af5d2fSBarry Smith }
1184f1af5d2fSBarry Smith 
11854a2ae208SSatish Balay #undef __FUNCT__
118632121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_newdatastruct"
118732121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
118832121132SShri Abhyankar {
118932121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
119032121132SShri Abhyankar   PetscErrorCode ierr;
119132121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
119232121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
119332121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
119432121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
119532121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
119632121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
119732121132SShri Abhyankar   PetscScalar    s1,s2,s3,x1,x2,x3;
119832121132SShri Abhyankar   PetscScalar    *x,*b,*t;
119932121132SShri Abhyankar 
120032121132SShri Abhyankar   PetscFunctionBegin;
120132121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
120232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120332121132SShri Abhyankar   t = a->solve_work;
120432121132SShri Abhyankar 
120532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
120632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
120732121132SShri Abhyankar 
120832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
120932121132SShri Abhyankar   for(i=0;i<n;i++){
121032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
121132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
121232121132SShri Abhyankar   }
121332121132SShri Abhyankar 
121432121132SShri Abhyankar   /* forward solve the U^T */
121532121132SShri Abhyankar   idx = 0;
121632121132SShri Abhyankar   for (i=0; i<n; i++) {
121732121132SShri Abhyankar     v     = aa + bs2*diag[i];
121832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
121932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
122032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
122132121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
122232121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
122332121132SShri Abhyankar     v -= bs2;
122432121132SShri Abhyankar 
122532121132SShri Abhyankar     vi    = aj + diag[i] - 1;
122632121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
122732121132SShri Abhyankar     for(j=0;j>-nz;j--){
122832121132SShri Abhyankar       oidx = bs*vi[j];
122932121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
123032121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
123132121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
123232121132SShri Abhyankar       v  -= bs2;
123332121132SShri Abhyankar     }
123432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
123532121132SShri Abhyankar     idx += bs;
123632121132SShri Abhyankar   }
123732121132SShri Abhyankar   /* backward solve the L^T */
123832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
123932121132SShri Abhyankar     v    = aa + bs2*ai[i];
124032121132SShri Abhyankar     vi   = aj + ai[i];
124132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
124232121132SShri Abhyankar     idt  = bs*i;
124332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
124432121132SShri Abhyankar     for(j=0;j<nz;j++){
124532121132SShri Abhyankar       idx   = bs*vi[j];
124632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
124732121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
124832121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
124932121132SShri Abhyankar       v += bs2;
125032121132SShri Abhyankar     }
125132121132SShri Abhyankar   }
125232121132SShri Abhyankar 
125332121132SShri Abhyankar   /* copy t into x according to permutation */
125432121132SShri Abhyankar   for(i=0;i<n;i++){
125532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
125632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
125732121132SShri Abhyankar   }
125832121132SShri Abhyankar 
125932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
126032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
126132121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
126232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
126332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
126432121132SShri Abhyankar   PetscFunctionReturn(0);
126532121132SShri Abhyankar }
126632121132SShri Abhyankar 
126732121132SShri Abhyankar #undef __FUNCT__
1268*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
1269*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1270f1af5d2fSBarry Smith {
1271f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1272f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
12736849ba73SBarry Smith   PetscErrorCode ierr;
12745d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
12755d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1276690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1277f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
127887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
127987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1280f1af5d2fSBarry Smith 
1281f1af5d2fSBarry Smith   PetscFunctionBegin;
12821ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
12831ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1284f1af5d2fSBarry Smith   t  = a->solve_work;
1285f1af5d2fSBarry Smith 
1286f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1287f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1288f1af5d2fSBarry Smith 
1289f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1290f1af5d2fSBarry Smith   ii = 0;
1291f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1292f1af5d2fSBarry Smith     ic      = 4*c[i];
1293f1af5d2fSBarry Smith     t[ii]   = b[ic];
1294f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1295f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1296f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1297f1af5d2fSBarry Smith     ii += 4;
1298f1af5d2fSBarry Smith   }
1299f1af5d2fSBarry Smith 
1300f1af5d2fSBarry Smith   /* forward solve the U^T */
1301f1af5d2fSBarry Smith   idx = 0;
1302f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1303f1af5d2fSBarry Smith 
1304f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1305f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1306f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1307f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1308f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1309f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1310f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1311f1af5d2fSBarry Smith     v += 16;
1312f1af5d2fSBarry Smith 
1313f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1314f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1315f1af5d2fSBarry Smith     while (nz--) {
1316f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1317f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1318f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1319f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1320f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1321f1af5d2fSBarry Smith       v  += 16;
1322f1af5d2fSBarry Smith     }
1323f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1324f1af5d2fSBarry Smith     idx += 4;
1325f1af5d2fSBarry Smith   }
1326f1af5d2fSBarry Smith   /* backward solve the L^T */
1327f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1328f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1329f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1330f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1331f1af5d2fSBarry Smith     idt  = 4*i;
1332f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1333f1af5d2fSBarry Smith     while (nz--) {
1334f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1335f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1336f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1337f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1338f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1339f1af5d2fSBarry Smith       v -= 16;
1340f1af5d2fSBarry Smith     }
1341f1af5d2fSBarry Smith   }
1342f1af5d2fSBarry Smith 
1343f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1344f1af5d2fSBarry Smith   ii = 0;
1345f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1346f1af5d2fSBarry Smith     ir      = 4*r[i];
1347f1af5d2fSBarry Smith     x[ir]   = t[ii];
1348f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1349f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1350f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1351f1af5d2fSBarry Smith     ii += 4;
1352f1af5d2fSBarry Smith   }
1353f1af5d2fSBarry Smith 
1354f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1355f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
13561ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
13571ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1358dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1359f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1360f1af5d2fSBarry Smith }
1361f1af5d2fSBarry Smith 
13624a2ae208SSatish Balay #undef __FUNCT__
136332121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_newdatastruct"
136432121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
136532121132SShri Abhyankar {
136632121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
136732121132SShri Abhyankar   PetscErrorCode ierr;
136832121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
136932121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
137032121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
137132121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
137232121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
137332121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
137432121132SShri Abhyankar   PetscScalar    s1,s2,s3,s4,x1,x2,x3,x4;
137532121132SShri Abhyankar   PetscScalar    *x,*b,*t;
137632121132SShri Abhyankar 
137732121132SShri Abhyankar   PetscFunctionBegin;
137832121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
137932121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
138032121132SShri Abhyankar   t = a->solve_work;
138132121132SShri Abhyankar 
138232121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
138332121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
138432121132SShri Abhyankar 
138532121132SShri Abhyankar   /* copy b into temp work space according to permutation */
138632121132SShri Abhyankar   for(i=0;i<n;i++){
138732121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
138832121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
138932121132SShri Abhyankar   }
139032121132SShri Abhyankar 
139132121132SShri Abhyankar   /* forward solve the U^T */
139232121132SShri Abhyankar   idx = 0;
139332121132SShri Abhyankar   for (i=0; i<n; i++) {
139432121132SShri Abhyankar     v     = aa + bs2*diag[i];
139532121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
139632121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
139732121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
139832121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
139932121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
140032121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
140132121132SShri Abhyankar     v -= bs2;
140232121132SShri Abhyankar 
140332121132SShri Abhyankar     vi    = aj + diag[i] - 1;
140432121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
140532121132SShri Abhyankar     for(j=0;j>-nz;j--){
140632121132SShri Abhyankar       oidx = bs*vi[j];
140732121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
140832121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
140932121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
141032121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
141132121132SShri Abhyankar       v  -= bs2;
141232121132SShri Abhyankar     }
141332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
141432121132SShri Abhyankar     idx += bs;
141532121132SShri Abhyankar   }
141632121132SShri Abhyankar   /* backward solve the L^T */
141732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
141832121132SShri Abhyankar     v    = aa + bs2*ai[i];
141932121132SShri Abhyankar     vi   = aj + ai[i];
142032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
142132121132SShri Abhyankar     idt  = bs*i;
142232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
142332121132SShri Abhyankar     for(j=0;j<nz;j++){
142432121132SShri Abhyankar       idx   = bs*vi[j];
142532121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
142632121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
142732121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
142832121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
142932121132SShri Abhyankar       v += bs2;
143032121132SShri Abhyankar     }
143132121132SShri Abhyankar   }
143232121132SShri Abhyankar 
143332121132SShri Abhyankar   /* copy t into x according to permutation */
143432121132SShri Abhyankar   for(i=0;i<n;i++){
143532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
143632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
143732121132SShri Abhyankar   }
143832121132SShri Abhyankar 
143932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
144032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
144132121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
144232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
144332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
144432121132SShri Abhyankar   PetscFunctionReturn(0);
144532121132SShri Abhyankar }
144632121132SShri Abhyankar 
144732121132SShri Abhyankar #undef __FUNCT__
1448*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
1449*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1450f1af5d2fSBarry Smith {
1451f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1452f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
14536849ba73SBarry Smith   PetscErrorCode ierr;
14545d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
14555d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1456690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1457f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
145887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
145987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1460f1af5d2fSBarry Smith 
1461f1af5d2fSBarry Smith   PetscFunctionBegin;
14621ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
14631ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1464f1af5d2fSBarry Smith   t  = a->solve_work;
1465f1af5d2fSBarry Smith 
1466f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1467f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1468f1af5d2fSBarry Smith 
1469f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1470f1af5d2fSBarry Smith   ii = 0;
1471f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1472f1af5d2fSBarry Smith     ic      = 5*c[i];
1473f1af5d2fSBarry Smith     t[ii]   = b[ic];
1474f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1475f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1476f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1477f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1478f1af5d2fSBarry Smith     ii += 5;
1479f1af5d2fSBarry Smith   }
1480f1af5d2fSBarry Smith 
1481f1af5d2fSBarry Smith   /* forward solve the U^T */
1482f1af5d2fSBarry Smith   idx = 0;
1483f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1484f1af5d2fSBarry Smith 
1485f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1486f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1487f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1488f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1489f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1490f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1491f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1492f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1493f1af5d2fSBarry Smith     v += 25;
1494f1af5d2fSBarry Smith 
1495f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1496f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1497f1af5d2fSBarry Smith     while (nz--) {
1498f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1499f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1500f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1501f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1502f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1503f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1504f1af5d2fSBarry Smith       v  += 25;
1505f1af5d2fSBarry Smith     }
1506f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1507f1af5d2fSBarry Smith     idx += 5;
1508f1af5d2fSBarry Smith   }
1509f1af5d2fSBarry Smith   /* backward solve the L^T */
1510f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1511f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1512f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1513f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1514f1af5d2fSBarry Smith     idt  = 5*i;
1515f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1516f1af5d2fSBarry Smith     while (nz--) {
1517f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1518f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1519f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1520f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1521f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1522f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1523f1af5d2fSBarry Smith       v -= 25;
1524f1af5d2fSBarry Smith     }
1525f1af5d2fSBarry Smith   }
1526f1af5d2fSBarry Smith 
1527f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1528f1af5d2fSBarry Smith   ii = 0;
1529f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1530f1af5d2fSBarry Smith     ir      = 5*r[i];
1531f1af5d2fSBarry Smith     x[ir]   = t[ii];
1532f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1533f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1534f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1535f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1536f1af5d2fSBarry Smith     ii += 5;
1537f1af5d2fSBarry Smith   }
1538f1af5d2fSBarry Smith 
1539f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1540f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
15411ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
15421ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1543dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1544f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1545f1af5d2fSBarry Smith }
1546f1af5d2fSBarry Smith 
15474a2ae208SSatish Balay #undef __FUNCT__
154832121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_newdatastruct"
154932121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
155032121132SShri Abhyankar {
155132121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
155232121132SShri Abhyankar   PetscErrorCode ierr;
155332121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
155432121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
155532121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
155632121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
155732121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
155832121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
155932121132SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
156032121132SShri Abhyankar   PetscScalar    *x,*b,*t;
156132121132SShri Abhyankar 
156232121132SShri Abhyankar   PetscFunctionBegin;
156332121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
156432121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
156532121132SShri Abhyankar   t = a->solve_work;
156632121132SShri Abhyankar 
156732121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
156832121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
156932121132SShri Abhyankar 
157032121132SShri Abhyankar   /* copy b into temp work space according to permutation */
157132121132SShri Abhyankar   for(i=0;i<n;i++){
157232121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
157332121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
157432121132SShri Abhyankar     t[ii+4] = b[ic+4];
157532121132SShri Abhyankar   }
157632121132SShri Abhyankar 
157732121132SShri Abhyankar   /* forward solve the U^T */
157832121132SShri Abhyankar   idx = 0;
157932121132SShri Abhyankar   for (i=0; i<n; i++) {
158032121132SShri Abhyankar     v     = aa + bs2*diag[i];
158132121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
158232121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
158332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
158432121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
158532121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
158632121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
158732121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
158832121132SShri Abhyankar     v -= bs2;
158932121132SShri Abhyankar 
159032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
159132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
159232121132SShri Abhyankar     for(j=0;j>-nz;j--){
159332121132SShri Abhyankar       oidx = bs*vi[j];
159432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
159532121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
159632121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
159732121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
159832121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
159932121132SShri Abhyankar       v  -= bs2;
160032121132SShri Abhyankar     }
160132121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
160232121132SShri Abhyankar     idx += bs;
160332121132SShri Abhyankar   }
160432121132SShri Abhyankar   /* backward solve the L^T */
160532121132SShri Abhyankar   for (i=n-1; i>=0; i--){
160632121132SShri Abhyankar     v    = aa + bs2*ai[i];
160732121132SShri Abhyankar     vi   = aj + ai[i];
160832121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
160932121132SShri Abhyankar     idt  = bs*i;
161032121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
161132121132SShri Abhyankar     for(j=0;j<nz;j++){
161232121132SShri Abhyankar       idx   = bs*vi[j];
161332121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
161432121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
161532121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
161632121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
161732121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
161832121132SShri Abhyankar       v += bs2;
161932121132SShri Abhyankar     }
162032121132SShri Abhyankar   }
162132121132SShri Abhyankar 
162232121132SShri Abhyankar   /* copy t into x according to permutation */
162332121132SShri Abhyankar   for(i=0;i<n;i++){
162432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
162532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
162632121132SShri Abhyankar     x[ir+4] = t[ii+4];
162732121132SShri Abhyankar   }
162832121132SShri Abhyankar 
162932121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
163032121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
163132121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
163232121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
163332121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
163432121132SShri Abhyankar   PetscFunctionReturn(0);
163532121132SShri Abhyankar }
163632121132SShri Abhyankar 
163732121132SShri Abhyankar #undef __FUNCT__
1638*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
1639*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1640f1af5d2fSBarry Smith {
1641f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1642f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
16436849ba73SBarry Smith   PetscErrorCode ierr;
16445d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
16455d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1646690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1647f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
164887828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
164987828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1650f1af5d2fSBarry Smith 
1651f1af5d2fSBarry Smith   PetscFunctionBegin;
16521ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
16531ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1654f1af5d2fSBarry Smith   t  = a->solve_work;
1655f1af5d2fSBarry Smith 
1656f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1657f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1658f1af5d2fSBarry Smith 
1659f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1660f1af5d2fSBarry Smith   ii = 0;
1661f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1662f1af5d2fSBarry Smith     ic      = 6*c[i];
1663f1af5d2fSBarry Smith     t[ii]   = b[ic];
1664f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1665f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1666f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1667f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1668f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1669f1af5d2fSBarry Smith     ii += 6;
1670f1af5d2fSBarry Smith   }
1671f1af5d2fSBarry Smith 
1672f1af5d2fSBarry Smith   /* forward solve the U^T */
1673f1af5d2fSBarry Smith   idx = 0;
1674f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1675f1af5d2fSBarry Smith 
1676f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1677f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1678f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1679f1af5d2fSBarry Smith     x6    = t[5+idx];
1680f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1681f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1682f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1683f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1684f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1685f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1686f1af5d2fSBarry Smith     v += 36;
1687f1af5d2fSBarry Smith 
1688f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1689f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1690f1af5d2fSBarry Smith     while (nz--) {
1691f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1692f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1693f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1694f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1695f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1696f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1697f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1698f1af5d2fSBarry Smith       v  += 36;
1699f1af5d2fSBarry Smith     }
1700f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1701f1af5d2fSBarry Smith     t[5+idx] = s6;
1702f1af5d2fSBarry Smith     idx += 6;
1703f1af5d2fSBarry Smith   }
1704f1af5d2fSBarry Smith   /* backward solve the L^T */
1705f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1706f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1707f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1708f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1709f1af5d2fSBarry Smith     idt  = 6*i;
1710f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1711f1af5d2fSBarry Smith     s6 = t[5+idt];
1712f1af5d2fSBarry Smith     while (nz--) {
1713f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1714f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1715f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1716f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1717f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1718f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1719f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1720f1af5d2fSBarry Smith       v -= 36;
1721f1af5d2fSBarry Smith     }
1722f1af5d2fSBarry Smith   }
1723f1af5d2fSBarry Smith 
1724f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1725f1af5d2fSBarry Smith   ii = 0;
1726f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1727f1af5d2fSBarry Smith     ir      = 6*r[i];
1728f1af5d2fSBarry Smith     x[ir]   = t[ii];
1729f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1730f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1731f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1732f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1733f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1734f1af5d2fSBarry Smith     ii += 6;
1735f1af5d2fSBarry Smith   }
1736f1af5d2fSBarry Smith 
1737f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1738f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
17391ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
17401ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1741dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1742f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1743f1af5d2fSBarry Smith }
1744f1af5d2fSBarry Smith 
17454a2ae208SSatish Balay #undef __FUNCT__
174632121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_newdatastruct"
174732121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
174832121132SShri Abhyankar {
174932121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
175032121132SShri Abhyankar   PetscErrorCode ierr;
175132121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
175232121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
175332121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
175432121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
175532121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
175632121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
175732121132SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
175832121132SShri Abhyankar   PetscScalar    *x,*b,*t;
175932121132SShri Abhyankar 
176032121132SShri Abhyankar   PetscFunctionBegin;
176132121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
176232121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
176332121132SShri Abhyankar   t = a->solve_work;
176432121132SShri Abhyankar 
176532121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
176632121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
176732121132SShri Abhyankar 
176832121132SShri Abhyankar   /* copy b into temp work space according to permutation */
176932121132SShri Abhyankar   for(i=0;i<n;i++){
177032121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
177132121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
177232121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
177332121132SShri Abhyankar   }
177432121132SShri Abhyankar 
177532121132SShri Abhyankar   /* forward solve the U^T */
177632121132SShri Abhyankar   idx = 0;
177732121132SShri Abhyankar   for (i=0; i<n; i++) {
177832121132SShri Abhyankar     v     = aa + bs2*diag[i];
177932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
178032121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
178132121132SShri Abhyankar     x6    = t[5+idx];
178232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
178332121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
178432121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
178532121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
178632121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
178732121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
178832121132SShri Abhyankar     v -= bs2;
178932121132SShri Abhyankar 
179032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
179132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
179232121132SShri Abhyankar     for(j=0;j>-nz;j--){
179332121132SShri Abhyankar       oidx = bs*vi[j];
179432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
179532121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
179632121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
179732121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
179832121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
179932121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
180032121132SShri Abhyankar       v  -= bs2;
180132121132SShri Abhyankar     }
180232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
180332121132SShri Abhyankar     t[5+idx] = s6;
180432121132SShri Abhyankar     idx += bs;
180532121132SShri Abhyankar   }
180632121132SShri Abhyankar   /* backward solve the L^T */
180732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
180832121132SShri Abhyankar     v    = aa + bs2*ai[i];
180932121132SShri Abhyankar     vi   = aj + ai[i];
181032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
181132121132SShri Abhyankar     idt  = bs*i;
181232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
181332121132SShri Abhyankar     s6   = t[5+idt];
181432121132SShri Abhyankar    for(j=0;j<nz;j++){
181532121132SShri Abhyankar       idx   = bs*vi[j];
181632121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
181732121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
181832121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
181932121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
182032121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
182132121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
182232121132SShri Abhyankar       v += bs2;
182332121132SShri Abhyankar     }
182432121132SShri Abhyankar   }
182532121132SShri Abhyankar 
182632121132SShri Abhyankar   /* copy t into x according to permutation */
182732121132SShri Abhyankar   for(i=0;i<n;i++){
182832121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
182932121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
183032121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
183132121132SShri Abhyankar   }
183232121132SShri Abhyankar 
183332121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
183432121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
183532121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
183632121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
183732121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
183832121132SShri Abhyankar   PetscFunctionReturn(0);
183932121132SShri Abhyankar }
184032121132SShri Abhyankar 
184132121132SShri Abhyankar #undef __FUNCT__
1842*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
1843*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1844f1af5d2fSBarry Smith {
1845f1af5d2fSBarry Smith   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
1846f1af5d2fSBarry Smith   IS             iscol=a->col,isrow=a->row;
18476849ba73SBarry Smith   PetscErrorCode ierr;
18485d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout;
18495d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
1850690b6cddSBarry Smith   PetscInt       *diag = a->diag,ii,ic,ir,oidx;
1851f1af5d2fSBarry Smith   MatScalar      *aa=a->a,*v;
185287828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
185387828ca2SBarry Smith   PetscScalar    *x,*b,*t;
1854f1af5d2fSBarry Smith 
1855f1af5d2fSBarry Smith   PetscFunctionBegin;
18561ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
18571ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1858f1af5d2fSBarry Smith   t  = a->solve_work;
1859f1af5d2fSBarry Smith 
1860f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1861f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1862f1af5d2fSBarry Smith 
1863f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1864f1af5d2fSBarry Smith   ii = 0;
1865f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1866f1af5d2fSBarry Smith     ic      = 7*c[i];
1867f1af5d2fSBarry Smith     t[ii]   = b[ic];
1868f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1869f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1870f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1871f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1872f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1873f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1874f1af5d2fSBarry Smith     ii += 7;
1875f1af5d2fSBarry Smith   }
1876f1af5d2fSBarry Smith 
1877f1af5d2fSBarry Smith   /* forward solve the U^T */
1878f1af5d2fSBarry Smith   idx = 0;
1879f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1880f1af5d2fSBarry Smith 
1881f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1882f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1883f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1884f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1885f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1886f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1887f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1888f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1889f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1890f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1891f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1892f1af5d2fSBarry Smith     v += 49;
1893f1af5d2fSBarry Smith 
1894f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1895f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1896f1af5d2fSBarry Smith     while (nz--) {
1897f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1898f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1899f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1900f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1901f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1902f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1903f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1904f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1905f1af5d2fSBarry Smith       v  += 49;
1906f1af5d2fSBarry Smith     }
1907f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1908f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1909f1af5d2fSBarry Smith     idx += 7;
1910f1af5d2fSBarry Smith   }
1911f1af5d2fSBarry Smith   /* backward solve the L^T */
1912f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1913f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1914f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1915f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1916f1af5d2fSBarry Smith     idt  = 7*i;
1917f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1918f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1919f1af5d2fSBarry Smith     while (nz--) {
1920f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1921f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1922f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1923f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1924f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1925f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1926f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1927f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1928f1af5d2fSBarry Smith       v -= 49;
1929f1af5d2fSBarry Smith     }
1930f1af5d2fSBarry Smith   }
1931f1af5d2fSBarry Smith 
1932f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1933f1af5d2fSBarry Smith   ii = 0;
1934f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1935f1af5d2fSBarry Smith     ir      = 7*r[i];
1936f1af5d2fSBarry Smith     x[ir]   = t[ii];
1937f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1938f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1939f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1940f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1941f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1942f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1943f1af5d2fSBarry Smith     ii += 7;
1944f1af5d2fSBarry Smith   }
1945f1af5d2fSBarry Smith 
1946f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1947f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
19481ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
19491ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1950dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1951f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1952f1af5d2fSBarry Smith }
195332121132SShri Abhyankar #undef __FUNCT__
195432121132SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_newdatastruct"
195532121132SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
195632121132SShri Abhyankar {
195732121132SShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
195832121132SShri Abhyankar   PetscErrorCode ierr;
195932121132SShri Abhyankar   IS             iscol=a->col,isrow=a->row;
196032121132SShri Abhyankar   PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
196132121132SShri Abhyankar   const PetscInt *r,*c,*rout,*cout;
196232121132SShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx,ii,ic,ir;
196332121132SShri Abhyankar   PetscInt       bs=A->rmap->bs,bs2=a->bs2;
196432121132SShri Abhyankar   MatScalar      *aa=a->a,*v;
196532121132SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
196632121132SShri Abhyankar   PetscScalar    *x,*b,*t;
196732121132SShri Abhyankar 
196832121132SShri Abhyankar   PetscFunctionBegin;
196932121132SShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
197032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
197132121132SShri Abhyankar   t = a->solve_work;
197232121132SShri Abhyankar 
197332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
197432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
197532121132SShri Abhyankar 
197632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
197732121132SShri Abhyankar   for(i=0;i<n;i++){
197832121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
197932121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
198032121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
198132121132SShri Abhyankar   }
198232121132SShri Abhyankar 
198332121132SShri Abhyankar   /* forward solve the U^T */
198432121132SShri Abhyankar   idx = 0;
198532121132SShri Abhyankar   for (i=0; i<n; i++) {
198632121132SShri Abhyankar     v     = aa + bs2*diag[i];
198732121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
198832121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
198932121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
199032121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
199132121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
199232121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
199332121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
199432121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
199532121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
199632121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
199732121132SShri Abhyankar     v -= bs2;
199832121132SShri Abhyankar 
199932121132SShri Abhyankar     vi    = aj + diag[i] - 1;
200032121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
200132121132SShri Abhyankar     for(j=0;j>-nz;j--){
200232121132SShri Abhyankar       oidx = bs*vi[j];
200332121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
200432121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
200532121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
200632121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
200732121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
200832121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
200932121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
201032121132SShri Abhyankar       v  -= bs2;
201132121132SShri Abhyankar     }
201232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
201332121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
201432121132SShri Abhyankar     idx += bs;
201532121132SShri Abhyankar   }
201632121132SShri Abhyankar   /* backward solve the L^T */
201732121132SShri Abhyankar   for (i=n-1; i>=0; i--){
201832121132SShri Abhyankar     v    = aa + bs2*ai[i];
201932121132SShri Abhyankar     vi   = aj + ai[i];
202032121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
202132121132SShri Abhyankar     idt  = bs*i;
202232121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
202332121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
202432121132SShri Abhyankar    for(j=0;j<nz;j++){
202532121132SShri Abhyankar       idx   = bs*vi[j];
202632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
202732121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
202832121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
202932121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
203032121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
203132121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
203232121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
203332121132SShri Abhyankar       v += bs2;
203432121132SShri Abhyankar     }
203532121132SShri Abhyankar   }
203632121132SShri Abhyankar 
203732121132SShri Abhyankar   /* copy t into x according to permutation */
203832121132SShri Abhyankar   for(i=0;i<n;i++){
203932121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
204032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
204132121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
204232121132SShri Abhyankar   }
204332121132SShri Abhyankar 
204432121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
204532121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
204632121132SShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
204732121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
204832121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
204932121132SShri Abhyankar   PetscFunctionReturn(0);
205032121132SShri Abhyankar }
2051f1af5d2fSBarry Smith 
20524e2b4712SSatish Balay /* ----------------------------------------------------------- */
20534a2ae208SSatish Balay #undef __FUNCT__
2054*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
2055*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
20564e2b4712SSatish Balay {
20574e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
20584e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
20596849ba73SBarry Smith   PetscErrorCode ierr;
20605d0c19d7SBarry Smith   const PetscInt *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
20615d0c19d7SBarry Smith   PetscInt       i,n=a->mbs;
20625d0c19d7SBarry Smith   PetscInt       nz,bs=A->rmap->bs,bs2=a->bs2;
20633f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
206487828ca2SBarry Smith   PetscScalar    *x,*b,*s,*t,*ls;
20654e2b4712SSatish Balay 
20664e2b4712SSatish Balay   PetscFunctionBegin;
20671ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
20681ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2069f1af5d2fSBarry Smith   t  = a->solve_work;
20704e2b4712SSatish Balay 
20714e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
20724e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
20734e2b4712SSatish Balay 
20744e2b4712SSatish Balay   /* forward solve the lower triangular */
207587828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
20764e2b4712SSatish Balay   for (i=1; i<n; i++) {
20774e2b4712SSatish Balay     v   = aa + bs2*ai[i];
20784e2b4712SSatish Balay     vi  = aj + ai[i];
20794e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2080f1af5d2fSBarry Smith     s = t + bs*i;
208187828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
20824e2b4712SSatish Balay     while (nz--) {
2083f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
20844e2b4712SSatish Balay       v += bs2;
20854e2b4712SSatish Balay     }
20864e2b4712SSatish Balay   }
20874e2b4712SSatish Balay   /* backward solve the upper triangular */
2088d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
20894e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20904e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
20914e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
20924e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
209387828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
20944e2b4712SSatish Balay     while (nz--) {
2095f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
20964e2b4712SSatish Balay       v += bs2;
20974e2b4712SSatish Balay     }
2098f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
209987828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21004e2b4712SSatish Balay   }
21014e2b4712SSatish Balay 
21024e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21034e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21041ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
21051ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2106dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21074e2b4712SSatish Balay   PetscFunctionReturn(0);
21084e2b4712SSatish Balay }
21094e2b4712SSatish Balay 
21105c42ef9dSBarry Smith /* ----------------------------------------------------------- */
21115c42ef9dSBarry Smith #undef __FUNCT__
2112*06e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
2113*06e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21145c42ef9dSBarry Smith {
21155c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21165c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
21175c42ef9dSBarry Smith   PetscErrorCode    ierr;
21185c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
21195c42ef9dSBarry Smith   PetscInt          i,n=a->mbs,j;
21205c42ef9dSBarry Smith   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
21215c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
21225c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
21235c42ef9dSBarry Smith   const PetscScalar *b;
21245c42ef9dSBarry Smith   PetscFunctionBegin;
21255c42ef9dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21265c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21275c42ef9dSBarry Smith   t    = a->solve_work;
21285c42ef9dSBarry Smith 
21295c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21305c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
21315c42ef9dSBarry Smith 
21325c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
21335c42ef9dSBarry Smith   for (i=0; i<n; i++) {
21345c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
21355c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
21365c42ef9dSBarry Smith     }
21375c42ef9dSBarry Smith   }
21385c42ef9dSBarry Smith 
21395c42ef9dSBarry Smith 
21405c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
21415c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
21425c42ef9dSBarry Smith   for (i=0; i<n; i++){
21435c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21445c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
21455c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
21465c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
21475c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
21485c42ef9dSBarry Smith     while (nz--) {
21495c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
21505c42ef9dSBarry Smith       v += bs2;
21515c42ef9dSBarry Smith     }
21525c42ef9dSBarry Smith   }
21535c42ef9dSBarry Smith 
21545c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
21555c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
21565c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
21575c42ef9dSBarry Smith     vi  = aj + ai[i];
21585c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
21595c42ef9dSBarry Smith     while (nz--) {
21605c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
21615c42ef9dSBarry Smith       v += bs2;
21625c42ef9dSBarry Smith     }
21635c42ef9dSBarry Smith   }
21645c42ef9dSBarry Smith 
21655c42ef9dSBarry Smith   /* copy t into x according to permutation */
21665c42ef9dSBarry Smith   for (i=0; i<n; i++) {
21675c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
21685c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
21695c42ef9dSBarry Smith     }
21705c42ef9dSBarry Smith   }
21715c42ef9dSBarry Smith 
21725c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21735c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21745c42ef9dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21755c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
21765c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21775c42ef9dSBarry Smith   PetscFunctionReturn(0);
21785c42ef9dSBarry Smith }
21795c42ef9dSBarry Smith 
21804a2ae208SSatish Balay #undef __FUNCT__
21818499736aSShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_newdatastruct"
21828499736aSShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N_newdatastruct(Mat A,Vec bb,Vec xx)
21838499736aSShri Abhyankar {
21848499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21858499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
21868499736aSShri Abhyankar   PetscErrorCode    ierr;
21878499736aSShri Abhyankar   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
21888499736aSShri Abhyankar   PetscInt          i,n=a->mbs,j;
21898499736aSShri Abhyankar   PetscInt          nz,bs=A->rmap->bs,bs2=a->bs2;
21908499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
21918499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
21928499736aSShri Abhyankar   const PetscScalar *b;
21938499736aSShri Abhyankar   PetscFunctionBegin;
21948499736aSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21958499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21968499736aSShri Abhyankar   t    = a->solve_work;
21978499736aSShri Abhyankar 
21988499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21998499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22008499736aSShri Abhyankar 
22018499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
22028499736aSShri Abhyankar   for (i=0; i<n; i++) {
22038499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22048499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
22058499736aSShri Abhyankar     }
22068499736aSShri Abhyankar   }
22078499736aSShri Abhyankar 
22088499736aSShri Abhyankar 
22098499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
22108499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
22118499736aSShri Abhyankar   for (i=0; i<n; i++){
22128499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22138499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
22148499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
22158499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
22168499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
22178499736aSShri Abhyankar     for(j=0;j>-nz;j--){
22188499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22198499736aSShri Abhyankar       v -= bs2;
22208499736aSShri Abhyankar     }
22218499736aSShri Abhyankar   }
22228499736aSShri Abhyankar 
22238499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
22248499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
22258499736aSShri Abhyankar     v   = aa + bs2*ai[i];
22268499736aSShri Abhyankar     vi  = aj + ai[i];
22278499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
22288499736aSShri Abhyankar     for(j=0;j<nz;j++){
22298499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22308499736aSShri Abhyankar       v += bs2;
22318499736aSShri Abhyankar     }
22328499736aSShri Abhyankar   }
22338499736aSShri Abhyankar 
22348499736aSShri Abhyankar   /* copy t into x according to permutation */
22358499736aSShri Abhyankar   for (i=0; i<n; i++) {
22368499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22378499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
22388499736aSShri Abhyankar     }
22398499736aSShri Abhyankar   }
22408499736aSShri Abhyankar 
22418499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22428499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22438499736aSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22448499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22458499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22468499736aSShri Abhyankar   PetscFunctionReturn(0);
22478499736aSShri Abhyankar }
22488499736aSShri Abhyankar 
22498499736aSShri Abhyankar #undef __FUNCT__
2250*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
2251*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
22524e2b4712SSatish Balay {
22534e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
22544e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
22556849ba73SBarry Smith   PetscErrorCode ierr;
22565d0c19d7SBarry Smith   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*rout,*cout,*diag = a->diag,*vi;
22575d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,nz,idx,idt,idc;
22583f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
225987828ca2SBarry Smith   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
226087828ca2SBarry Smith   PetscScalar    *x,*b,*t;
22614e2b4712SSatish Balay 
22624e2b4712SSatish Balay   PetscFunctionBegin;
22631ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
22641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2265f1af5d2fSBarry Smith   t  = a->solve_work;
22664e2b4712SSatish Balay 
22674e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22684e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
22694e2b4712SSatish Balay 
22704e2b4712SSatish Balay   /* forward solve the lower triangular */
22714e2b4712SSatish Balay   idx    = 7*(*r++);
2272f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2273f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2274f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
22754e2b4712SSatish Balay 
22764e2b4712SSatish Balay   for (i=1; i<n; i++) {
22774e2b4712SSatish Balay     v     = aa + 49*ai[i];
22784e2b4712SSatish Balay     vi    = aj + ai[i];
22794e2b4712SSatish Balay     nz    = diag[i] - ai[i];
22804e2b4712SSatish Balay     idx   = 7*(*r++);
2281f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2282f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
22834e2b4712SSatish Balay     while (nz--) {
22844e2b4712SSatish Balay       idx   = 7*(*vi++);
2285f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2286f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2287f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2288f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2289f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2290f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2291f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2292f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2293f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2294f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
22954e2b4712SSatish Balay       v += 49;
22964e2b4712SSatish Balay     }
22974e2b4712SSatish Balay     idx = 7*i;
2298f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2299f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2300f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
23014e2b4712SSatish Balay   }
23024e2b4712SSatish Balay   /* backward solve the upper triangular */
23034e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
23044e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
23054e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
23064e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
23074e2b4712SSatish Balay     idt  = 7*i;
2308f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2309f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2310f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
23114e2b4712SSatish Balay     while (nz--) {
23124e2b4712SSatish Balay       idx   = 7*(*vi++);
2313f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2314f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2315f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2316f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2317f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2318f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2319f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2320f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2321f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2322f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
23234e2b4712SSatish Balay       v += 49;
23244e2b4712SSatish Balay     }
23254e2b4712SSatish Balay     idc = 7*(*c--);
23264e2b4712SSatish Balay     v   = aa + 49*diag[i];
2327f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2328f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2329f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2330f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2331f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2332f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2333f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2334f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2335f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2336f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2337f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2338f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2339f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2340f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
23414e2b4712SSatish Balay   }
23424e2b4712SSatish Balay 
23434e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
23444e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
23451ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
23461ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2347dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
23484e2b4712SSatish Balay   PetscFunctionReturn(0);
23494e2b4712SSatish Balay }
23504e2b4712SSatish Balay 
23518f690400SShri Abhyankar #undef __FUNCT__
2352a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_newdatastruct"
2353a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_newdatastruct(Mat A,Vec bb,Vec xx)
235435aa4fcfSShri Abhyankar {
235535aa4fcfSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
235635aa4fcfSShri Abhyankar   IS             iscol=a->col,isrow=a->row;
235735aa4fcfSShri Abhyankar   PetscErrorCode ierr;
235835aa4fcfSShri Abhyankar   const PetscInt *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag,*rout,*cout,*vi;
235935aa4fcfSShri Abhyankar   PetscInt       i,n=a->mbs,nz,idx,idt,idc,m;
236035aa4fcfSShri Abhyankar   MatScalar      *aa=a->a,*v;
236135aa4fcfSShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
236235aa4fcfSShri Abhyankar   PetscScalar    *x,*b,*t;
236335aa4fcfSShri Abhyankar 
236435aa4fcfSShri Abhyankar   PetscFunctionBegin;
236535aa4fcfSShri Abhyankar   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
236635aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
236735aa4fcfSShri Abhyankar   t  = a->solve_work;
236835aa4fcfSShri Abhyankar 
236935aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
237035aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
237135aa4fcfSShri Abhyankar 
237235aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
237335aa4fcfSShri Abhyankar   idx    = 7*r[0];
237435aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
237535aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
237635aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
237735aa4fcfSShri Abhyankar 
237835aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
237935aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
238035aa4fcfSShri Abhyankar     vi    = aj + ai[i];
238135aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
238235aa4fcfSShri Abhyankar     idx   = 7*r[i];
238335aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
238435aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
238535aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
238635aa4fcfSShri Abhyankar       idx   = 7*vi[m];
238735aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
238835aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
238935aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
239035aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
239135aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
239235aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
239335aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
239435aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
239535aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
239635aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
239735aa4fcfSShri Abhyankar       v += 49;
239835aa4fcfSShri Abhyankar     }
239935aa4fcfSShri Abhyankar     idx = 7*i;
240035aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
240135aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
240235aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
240335aa4fcfSShri Abhyankar   }
240435aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
240535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
240635aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
240735aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
240835aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
240935aa4fcfSShri Abhyankar     idt  = 7*i;
241035aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
241135aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
241235aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
241335aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
241435aa4fcfSShri Abhyankar       idx   = 7*vi[m];
241535aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
241635aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
241735aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
241835aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
241935aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
242035aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
242135aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
242235aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
242335aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
242435aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
242535aa4fcfSShri Abhyankar       v += 49;
242635aa4fcfSShri Abhyankar     }
242735aa4fcfSShri Abhyankar     idc = 7*c[i];
242835aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
242935aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
243035aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
243135aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
243235aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
243335aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
243435aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
243535aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
243635aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
243735aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
243835aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
243935aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
244035aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
244135aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
244235aa4fcfSShri Abhyankar   }
244335aa4fcfSShri Abhyankar 
244435aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
244535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
244635aa4fcfSShri Abhyankar   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
244735aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
244835aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
244935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
245035aa4fcfSShri Abhyankar }
245135aa4fcfSShri Abhyankar 
245235aa4fcfSShri Abhyankar #undef __FUNCT__
2453*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
2454*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
245515091d37SBarry Smith {
245615091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2457690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2458dfbe8321SBarry Smith   PetscErrorCode    ierr;
2459690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2460d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2461d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2462d9fead3dSBarry Smith   const PetscScalar *b;
246315091d37SBarry Smith 
246415091d37SBarry Smith   PetscFunctionBegin;
2465d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24661ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
246715091d37SBarry Smith   /* forward solve the lower triangular */
246815091d37SBarry Smith   idx    = 0;
246915091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
247015091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
247115091d37SBarry Smith   x[6] = b[6+idx];
247215091d37SBarry Smith   for (i=1; i<n; i++) {
247315091d37SBarry Smith     v     =  aa + 49*ai[i];
247415091d37SBarry Smith     vi    =  aj + ai[i];
247515091d37SBarry Smith     nz    =  diag[i] - ai[i];
247615091d37SBarry Smith     idx   =  7*i;
2477f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2478f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2479f1af5d2fSBarry Smith     s7  =  b[6+idx];
248015091d37SBarry Smith     while (nz--) {
248115091d37SBarry Smith       jdx   = 7*(*vi++);
248215091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
248315091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
248415091d37SBarry Smith       x7    = x[6+jdx];
2485f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2486f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2487f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2488f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2489f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2490f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2491f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
249215091d37SBarry Smith       v += 49;
249315091d37SBarry Smith      }
2494f1af5d2fSBarry Smith     x[idx]   = s1;
2495f1af5d2fSBarry Smith     x[1+idx] = s2;
2496f1af5d2fSBarry Smith     x[2+idx] = s3;
2497f1af5d2fSBarry Smith     x[3+idx] = s4;
2498f1af5d2fSBarry Smith     x[4+idx] = s5;
2499f1af5d2fSBarry Smith     x[5+idx] = s6;
2500f1af5d2fSBarry Smith     x[6+idx] = s7;
250115091d37SBarry Smith   }
250215091d37SBarry Smith   /* backward solve the upper triangular */
250315091d37SBarry Smith   for (i=n-1; i>=0; i--){
250415091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
250515091d37SBarry Smith     vi   = aj + diag[i] + 1;
250615091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
250715091d37SBarry Smith     idt  = 7*i;
2508f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2509f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2510f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2511f1af5d2fSBarry Smith     s7 = x[6+idt];
251215091d37SBarry Smith     while (nz--) {
251315091d37SBarry Smith       idx   = 7*(*vi++);
251415091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
251515091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
251615091d37SBarry Smith       x7    = x[6+idx];
2517f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2518f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2519f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2520f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2521f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2522f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2523f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
252415091d37SBarry Smith       v += 49;
252515091d37SBarry Smith     }
252615091d37SBarry Smith     v        = aa + 49*diag[i];
2527f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2528f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2529f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2530f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2531f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2532f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2533f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2534f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2535f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2536f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2537f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2538f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2539f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2540f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
254115091d37SBarry Smith   }
254215091d37SBarry Smith 
2543d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2545dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
254615091d37SBarry Smith   PetscFunctionReturn(0);
254715091d37SBarry Smith }
254815091d37SBarry Smith 
2549cee9d6f2SShri Abhyankar #undef __FUNCT__
2550a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct"
2551a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
255253cca76cSShri Abhyankar {
255353cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
255453cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
255553cca76cSShri Abhyankar     PetscErrorCode    ierr;
255653cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
255753cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
255853cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
255953cca76cSShri Abhyankar     PetscScalar       *x;
256053cca76cSShri Abhyankar     const PetscScalar *b;
256153cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
256253cca76cSShri Abhyankar 
256353cca76cSShri Abhyankar     PetscFunctionBegin;
256453cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
256553cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
256653cca76cSShri Abhyankar     /* forward solve the lower triangular */
256753cca76cSShri Abhyankar     idx    = 0;
256853cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
256953cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
257053cca76cSShri Abhyankar     for (i=1; i<n; i++) {
257153cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
257253cca76cSShri Abhyankar        vi   = aj + ai[i];
257353cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
257453cca76cSShri Abhyankar       idx   = bs*i;
257553cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
257653cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
257753cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
257853cca76cSShri Abhyankar           jdx   = bs*vi[k];
257953cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
258053cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
258153cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
258253cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
258353cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
258453cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
258553cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
258653cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
258753cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
258853cca76cSShri Abhyankar           v   +=  bs2;
258953cca76cSShri Abhyankar         }
259053cca76cSShri Abhyankar 
259153cca76cSShri Abhyankar        x[idx]   = s1;
259253cca76cSShri Abhyankar        x[1+idx] = s2;
259353cca76cSShri Abhyankar        x[2+idx] = s3;
259453cca76cSShri Abhyankar        x[3+idx] = s4;
259553cca76cSShri Abhyankar        x[4+idx] = s5;
259653cca76cSShri Abhyankar        x[5+idx] = s6;
259753cca76cSShri Abhyankar        x[6+idx] = s7;
259853cca76cSShri Abhyankar     }
259953cca76cSShri Abhyankar 
260053cca76cSShri Abhyankar    /* backward solve the upper triangular */
260153cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
260253cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
260353cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
260453cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
260553cca76cSShri Abhyankar      idt = bs*i;
260653cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
260753cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
260853cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
260953cca76cSShri Abhyankar       idx   = bs*vi[k];
261053cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
261153cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
261253cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
261353cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
261453cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
261553cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
261653cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
261753cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
261853cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
261953cca76cSShri Abhyankar         v   +=  bs2;
262053cca76cSShri Abhyankar     }
262153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
262253cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
262353cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
262453cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
262553cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
262653cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
262753cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
262853cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
262953cca76cSShri Abhyankar   }
263053cca76cSShri Abhyankar 
263153cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
263253cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
263353cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
263453cca76cSShri Abhyankar   PetscFunctionReturn(0);
263553cca76cSShri Abhyankar }
263653cca76cSShri Abhyankar 
263753cca76cSShri Abhyankar #undef __FUNCT__
2638*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
2639*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
264015091d37SBarry Smith {
264115091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
264215091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
26436849ba73SBarry Smith   PetscErrorCode    ierr;
26445d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
26455d0c19d7SBarry Smith   PetscInt          *diag = a->diag,i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
2646d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2647d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2648d9fead3dSBarry Smith   const PetscScalar *b;
264915091d37SBarry Smith   PetscFunctionBegin;
2650d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
26511ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2652f1af5d2fSBarry Smith   t  = a->solve_work;
265315091d37SBarry Smith 
265415091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
265515091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
265615091d37SBarry Smith 
265715091d37SBarry Smith   /* forward solve the lower triangular */
265815091d37SBarry Smith   idx    = 6*(*r++);
2659f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2660f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2661f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
266215091d37SBarry Smith   for (i=1; i<n; i++) {
266315091d37SBarry Smith     v     = aa + 36*ai[i];
266415091d37SBarry Smith     vi    = aj + ai[i];
266515091d37SBarry Smith     nz    = diag[i] - ai[i];
266615091d37SBarry Smith     idx   = 6*(*r++);
2667f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2668f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
266915091d37SBarry Smith     while (nz--) {
267015091d37SBarry Smith       idx   = 6*(*vi++);
2671f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2672f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2673f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2674f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2675f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2676f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2677f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2678f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
267915091d37SBarry Smith       v += 36;
268015091d37SBarry Smith     }
268115091d37SBarry Smith     idx = 6*i;
2682f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2683f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2684f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
268515091d37SBarry Smith   }
268615091d37SBarry Smith   /* backward solve the upper triangular */
268715091d37SBarry Smith   for (i=n-1; i>=0; i--){
268815091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
268915091d37SBarry Smith     vi   = aj + diag[i] + 1;
269015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
269115091d37SBarry Smith     idt  = 6*i;
2692f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2693f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
2694f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
269515091d37SBarry Smith     while (nz--) {
269615091d37SBarry Smith       idx   = 6*(*vi++);
2697f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2698f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2699f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
2700f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2701f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2702f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2703f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2704f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2705f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
270615091d37SBarry Smith       v += 36;
270715091d37SBarry Smith     }
270815091d37SBarry Smith     idc = 6*(*c--);
270915091d37SBarry Smith     v   = aa + 36*diag[i];
2710f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2711f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
2712f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2713f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
2714f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2715f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
2716f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2717f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
2718f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2719f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
2720f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2721f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
272215091d37SBarry Smith   }
272315091d37SBarry Smith 
272415091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
272515091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2726d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2728dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
272915091d37SBarry Smith   PetscFunctionReturn(0);
273015091d37SBarry Smith }
273115091d37SBarry Smith 
27326506fda5SShri Abhyankar #undef __FUNCT__
2733a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_newdatastruct"
2734a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_newdatastruct(Mat A,Vec bb,Vec xx)
27356506fda5SShri Abhyankar {
27366506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
27376506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
27386506fda5SShri Abhyankar   PetscErrorCode    ierr;
27396506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
27406506fda5SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
27416506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
27426506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
27436506fda5SShri Abhyankar   const PetscScalar *b;
27446506fda5SShri Abhyankar   PetscFunctionBegin;
27456506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27466506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
27476506fda5SShri Abhyankar   t  = a->solve_work;
27486506fda5SShri Abhyankar 
27496506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
27506506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
27516506fda5SShri Abhyankar 
27526506fda5SShri Abhyankar   /* forward solve the lower triangular */
27536506fda5SShri Abhyankar   idx    = 6*r[0];
27546506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
27556506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
27566506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
27576506fda5SShri Abhyankar   for (i=1; i<n; i++) {
27586506fda5SShri Abhyankar     v     = aa + 36*ai[i];
27596506fda5SShri Abhyankar     vi    = aj + ai[i];
27606506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
27616506fda5SShri Abhyankar     idx   = 6*r[i];
27626506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
27636506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
27646506fda5SShri Abhyankar     for(m=0;m<nz;m++){
27656506fda5SShri Abhyankar       idx   = 6*vi[m];
27666506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
27676506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
27686506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
27696506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
27706506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
27716506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
27726506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
27736506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
27746506fda5SShri Abhyankar       v += 36;
27756506fda5SShri Abhyankar     }
27766506fda5SShri Abhyankar     idx = 6*i;
27776506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
27786506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
27796506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
27806506fda5SShri Abhyankar   }
27816506fda5SShri Abhyankar   /* backward solve the upper triangular */
27826506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
27836506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
27846506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
27856506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
27866506fda5SShri Abhyankar     idt  = 6*i;
27876506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
27886506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
27896506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
27906506fda5SShri Abhyankar     for(m=0;m<nz;m++){
27916506fda5SShri Abhyankar       idx   = 6*vi[m];
27926506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
27936506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
27946506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
27956506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
27966506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
27976506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
27986506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
27996506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
28006506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
28016506fda5SShri Abhyankar       v += 36;
28026506fda5SShri Abhyankar     }
28036506fda5SShri Abhyankar     idc = 6*c[i];
28046506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
28056506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
28066506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
28076506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
28086506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
28096506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
28106506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
28116506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
28126506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
28136506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
28146506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
28156506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
28166506fda5SShri Abhyankar   }
28176506fda5SShri Abhyankar 
28186506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
28196506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
28206506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28216506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
28226506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
28236506fda5SShri Abhyankar   PetscFunctionReturn(0);
28246506fda5SShri Abhyankar }
28258f690400SShri Abhyankar 
28268f690400SShri Abhyankar #undef __FUNCT__
2827*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
2828*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
282915091d37SBarry Smith {
283015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2831690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
2832dfbe8321SBarry Smith   PetscErrorCode    ierr;
2833690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
2834d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2835d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
2836d9fead3dSBarry Smith   const PetscScalar *b;
283715091d37SBarry Smith 
283815091d37SBarry Smith   PetscFunctionBegin;
2839d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28401ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
284115091d37SBarry Smith   /* forward solve the lower triangular */
284215091d37SBarry Smith   idx    = 0;
284315091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
284415091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
284515091d37SBarry Smith   for (i=1; i<n; i++) {
284615091d37SBarry Smith     v     =  aa + 36*ai[i];
284715091d37SBarry Smith     vi    =  aj + ai[i];
284815091d37SBarry Smith     nz    =  diag[i] - ai[i];
284915091d37SBarry Smith     idx   =  6*i;
2850f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2851f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
285215091d37SBarry Smith     while (nz--) {
285315091d37SBarry Smith       jdx   = 6*(*vi++);
285415091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
285515091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
2856f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2857f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2858f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2859f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2860f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2861f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
286215091d37SBarry Smith       v += 36;
286315091d37SBarry Smith      }
2864f1af5d2fSBarry Smith     x[idx]   = s1;
2865f1af5d2fSBarry Smith     x[1+idx] = s2;
2866f1af5d2fSBarry Smith     x[2+idx] = s3;
2867f1af5d2fSBarry Smith     x[3+idx] = s4;
2868f1af5d2fSBarry Smith     x[4+idx] = s5;
2869f1af5d2fSBarry Smith     x[5+idx] = s6;
287015091d37SBarry Smith   }
287115091d37SBarry Smith   /* backward solve the upper triangular */
287215091d37SBarry Smith   for (i=n-1; i>=0; i--){
287315091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
287415091d37SBarry Smith     vi   = aj + diag[i] + 1;
287515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
287615091d37SBarry Smith     idt  = 6*i;
2877f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2878f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2879f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
288015091d37SBarry Smith     while (nz--) {
288115091d37SBarry Smith       idx   = 6*(*vi++);
288215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
288315091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
2884f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2885f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2886f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2887f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2888f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2889f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
289015091d37SBarry Smith       v += 36;
289115091d37SBarry Smith     }
289215091d37SBarry Smith     v        = aa + 36*diag[i];
2893f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
2894f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
2895f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
2896f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
2897f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
2898f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
289915091d37SBarry Smith   }
290015091d37SBarry Smith 
2901d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29021ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2903dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
290415091d37SBarry Smith   PetscFunctionReturn(0);
290515091d37SBarry Smith }
290615091d37SBarry Smith 
2907cee9d6f2SShri Abhyankar #undef __FUNCT__
2908a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct"
2909a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
291053cca76cSShri Abhyankar {
291153cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
291253cca76cSShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
291353cca76cSShri Abhyankar     PetscErrorCode    ierr;
291453cca76cSShri Abhyankar     PetscInt          idx,jdx,idt;
291553cca76cSShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
291653cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
291753cca76cSShri Abhyankar     PetscScalar       *x;
291853cca76cSShri Abhyankar     const PetscScalar *b;
291953cca76cSShri Abhyankar     PetscScalar        s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
292053cca76cSShri Abhyankar 
292153cca76cSShri Abhyankar     PetscFunctionBegin;
292253cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
292353cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
292453cca76cSShri Abhyankar     /* forward solve the lower triangular */
292553cca76cSShri Abhyankar     idx    = 0;
292653cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
292753cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
292853cca76cSShri Abhyankar     for (i=1; i<n; i++) {
292953cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
293053cca76cSShri Abhyankar        vi   = aj + ai[i];
293153cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
293253cca76cSShri Abhyankar       idx   = bs*i;
293353cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
293453cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
293553cca76cSShri Abhyankar        for(k=0;k<nz;k++){
293653cca76cSShri Abhyankar           jdx   = bs*vi[k];
293753cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
293853cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
293953cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
294053cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
294153cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
294253cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
294353cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
294453cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
294553cca76cSShri Abhyankar           v   +=  bs2;
294653cca76cSShri Abhyankar         }
294753cca76cSShri Abhyankar 
294853cca76cSShri Abhyankar        x[idx]   = s1;
294953cca76cSShri Abhyankar        x[1+idx] = s2;
295053cca76cSShri Abhyankar        x[2+idx] = s3;
295153cca76cSShri Abhyankar        x[3+idx] = s4;
295253cca76cSShri Abhyankar        x[4+idx] = s5;
295353cca76cSShri Abhyankar        x[5+idx] = s6;
295453cca76cSShri Abhyankar     }
295553cca76cSShri Abhyankar 
295653cca76cSShri Abhyankar    /* backward solve the upper triangular */
295753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
295853cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
295953cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
296053cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
296153cca76cSShri Abhyankar      idt = bs*i;
296253cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
296353cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
296453cca76cSShri Abhyankar      for(k=0;k<nz;k++){
296553cca76cSShri Abhyankar       idx   = bs*vi[k];
296653cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
296753cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
296853cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
296953cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
297053cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
297153cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
297253cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
297353cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
297453cca76cSShri Abhyankar         v   +=  bs2;
297553cca76cSShri Abhyankar     }
297653cca76cSShri Abhyankar     /* x = inv_diagonal*x */
297753cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
297853cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
297953cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
298053cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
298153cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
298253cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
298353cca76cSShri Abhyankar   }
298453cca76cSShri Abhyankar 
298553cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
298653cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
298753cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
298853cca76cSShri Abhyankar   PetscFunctionReturn(0);
298953cca76cSShri Abhyankar }
299053cca76cSShri Abhyankar 
299153cca76cSShri Abhyankar #undef __FUNCT__
2992*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
2993*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
29944e2b4712SSatish Balay {
29954e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
29964e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
29976849ba73SBarry Smith   PetscErrorCode    ierr;
29985d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
29995d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
3000d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3001d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3002d9fead3dSBarry Smith   const PetscScalar *b;
30034e2b4712SSatish Balay 
30044e2b4712SSatish Balay   PetscFunctionBegin;
3005d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30061ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3007f1af5d2fSBarry Smith   t  = a->solve_work;
30084e2b4712SSatish Balay 
30094e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
30104e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
30114e2b4712SSatish Balay 
30124e2b4712SSatish Balay   /* forward solve the lower triangular */
30134e2b4712SSatish Balay   idx    = 5*(*r++);
3014f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3015f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
30164e2b4712SSatish Balay   for (i=1; i<n; i++) {
30174e2b4712SSatish Balay     v     = aa + 25*ai[i];
30184e2b4712SSatish Balay     vi    = aj + ai[i];
30194e2b4712SSatish Balay     nz    = diag[i] - ai[i];
30204e2b4712SSatish Balay     idx   = 5*(*r++);
3021f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3022f1af5d2fSBarry Smith     s5  = b[4+idx];
30234e2b4712SSatish Balay     while (nz--) {
30244e2b4712SSatish Balay       idx   = 5*(*vi++);
3025f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3026f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3027f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3028f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3029f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3030f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3031f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
30324e2b4712SSatish Balay       v += 25;
30334e2b4712SSatish Balay     }
30344e2b4712SSatish Balay     idx = 5*i;
3035f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3036f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
30374e2b4712SSatish Balay   }
30384e2b4712SSatish Balay   /* backward solve the upper triangular */
30394e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
30404e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
30414e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
30424e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
30434e2b4712SSatish Balay     idt  = 5*i;
3044f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3045f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
30464e2b4712SSatish Balay     while (nz--) {
30474e2b4712SSatish Balay       idx   = 5*(*vi++);
3048f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3049f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3050f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3051f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3052f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3053f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3054f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
30554e2b4712SSatish Balay       v += 25;
30564e2b4712SSatish Balay     }
30574e2b4712SSatish Balay     idc = 5*(*c--);
30584e2b4712SSatish Balay     v   = aa + 25*diag[i];
3059f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3060f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3061f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3062f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3063f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3064f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3065f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3066f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3067f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3068f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
30694e2b4712SSatish Balay   }
30704e2b4712SSatish Balay 
30714e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30724e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3073d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30741ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3075dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
30764e2b4712SSatish Balay   PetscFunctionReturn(0);
30774e2b4712SSatish Balay }
30784e2b4712SSatish Balay 
307978bb4007SShri Abhyankar #undef __FUNCT__
3080a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_newdatastruct"
3081a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_newdatastruct(Mat A,Vec bb,Vec xx)
308278bb4007SShri Abhyankar {
308378bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
308478bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
308578bb4007SShri Abhyankar   PetscErrorCode    ierr;
308678bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
308778bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
308878bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
308978bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
309078bb4007SShri Abhyankar   const PetscScalar *b;
309178bb4007SShri Abhyankar 
309278bb4007SShri Abhyankar   PetscFunctionBegin;
309378bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
309478bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
309578bb4007SShri Abhyankar   t  = a->solve_work;
309678bb4007SShri Abhyankar 
309778bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
309878bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
309978bb4007SShri Abhyankar 
310078bb4007SShri Abhyankar   /* forward solve the lower triangular */
310178bb4007SShri Abhyankar   idx    = 5*r[0];
310278bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
310378bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
310478bb4007SShri Abhyankar   for (i=1; i<n; i++) {
310578bb4007SShri Abhyankar     v     = aa + 25*ai[i];
310678bb4007SShri Abhyankar     vi    = aj + ai[i];
310778bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
310878bb4007SShri Abhyankar     idx   = 5*r[i];
310978bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
311078bb4007SShri Abhyankar     s5  = b[4+idx];
311178bb4007SShri Abhyankar     for(m=0;m<nz;m++){
311278bb4007SShri Abhyankar       idx   = 5*vi[m];
311378bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
311478bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
311578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
311678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
311778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
311878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
311978bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
312078bb4007SShri Abhyankar       v += 25;
312178bb4007SShri Abhyankar     }
312278bb4007SShri Abhyankar     idx = 5*i;
312378bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
312478bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
312578bb4007SShri Abhyankar   }
312678bb4007SShri Abhyankar   /* backward solve the upper triangular */
312778bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
312878bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
312978bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
313078bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
313178bb4007SShri Abhyankar     idt  = 5*i;
313278bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
313378bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
313478bb4007SShri Abhyankar     for(m=0;m<nz;m++){
313578bb4007SShri Abhyankar       idx   = 5*vi[m];
313678bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
313778bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
313878bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
313978bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
314078bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
314178bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
314278bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
314378bb4007SShri Abhyankar       v += 25;
314478bb4007SShri Abhyankar     }
314578bb4007SShri Abhyankar     idc = 5*c[i];
314678bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
314778bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
314878bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
314978bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
315078bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
315178bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
315278bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
315378bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
315478bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
315578bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
315678bb4007SShri Abhyankar   }
315778bb4007SShri Abhyankar 
315878bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
315978bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
316078bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
316178bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
316278bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
316378bb4007SShri Abhyankar   PetscFunctionReturn(0);
316478bb4007SShri Abhyankar }
316578bb4007SShri Abhyankar 
31668f690400SShri Abhyankar #undef __FUNCT__
3167*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
3168*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
316915091d37SBarry Smith {
317015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3171690b6cddSBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt;
3172dfbe8321SBarry Smith   PetscErrorCode    ierr;
3173690b6cddSBarry Smith   PetscInt          *diag = a->diag,jdx;
3174d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3175d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3176d9fead3dSBarry Smith   const PetscScalar *b;
317715091d37SBarry Smith 
317815091d37SBarry Smith   PetscFunctionBegin;
3179d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31801ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
318115091d37SBarry Smith   /* forward solve the lower triangular */
318215091d37SBarry Smith   idx    = 0;
318315091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
318415091d37SBarry Smith   for (i=1; i<n; i++) {
318515091d37SBarry Smith     v     =  aa + 25*ai[i];
318615091d37SBarry Smith     vi    =  aj + ai[i];
318715091d37SBarry Smith     nz    =  diag[i] - ai[i];
318815091d37SBarry Smith     idx   =  5*i;
3189f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
319015091d37SBarry Smith     while (nz--) {
319115091d37SBarry Smith       jdx   = 5*(*vi++);
319215091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3193f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3194f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3195f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3196f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3197f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
319815091d37SBarry Smith       v    += 25;
319915091d37SBarry Smith     }
3200f1af5d2fSBarry Smith     x[idx]   = s1;
3201f1af5d2fSBarry Smith     x[1+idx] = s2;
3202f1af5d2fSBarry Smith     x[2+idx] = s3;
3203f1af5d2fSBarry Smith     x[3+idx] = s4;
3204f1af5d2fSBarry Smith     x[4+idx] = s5;
320515091d37SBarry Smith   }
320615091d37SBarry Smith   /* backward solve the upper triangular */
320715091d37SBarry Smith   for (i=n-1; i>=0; i--){
320815091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
320915091d37SBarry Smith     vi   = aj + diag[i] + 1;
321015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
321115091d37SBarry Smith     idt  = 5*i;
3212f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3213f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
321415091d37SBarry Smith     while (nz--) {
321515091d37SBarry Smith       idx   = 5*(*vi++);
321615091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3217f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3218f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3219f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3220f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3221f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
322215091d37SBarry Smith       v    += 25;
322315091d37SBarry Smith     }
322415091d37SBarry Smith     v        = aa + 25*diag[i];
3225f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3226f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3227f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3228f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3229f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
323015091d37SBarry Smith   }
323115091d37SBarry Smith 
3232d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
323515091d37SBarry Smith   PetscFunctionReturn(0);
323615091d37SBarry Smith }
323715091d37SBarry Smith 
3238cee9d6f2SShri Abhyankar #undef __FUNCT__
3239a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct"
3240a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
324153cca76cSShri Abhyankar {
324253cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
324353cca76cSShri Abhyankar   PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
324453cca76cSShri Abhyankar   PetscErrorCode    ierr;
324553cca76cSShri Abhyankar   PetscInt          jdx;
324653cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
324753cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
324853cca76cSShri Abhyankar   const PetscScalar *b;
324953cca76cSShri Abhyankar 
325053cca76cSShri Abhyankar   PetscFunctionBegin;
325153cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
325253cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
325353cca76cSShri Abhyankar   /* forward solve the lower triangular */
325453cca76cSShri Abhyankar   idx    = 0;
325553cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
325653cca76cSShri Abhyankar   for (i=1; i<n; i++) {
325753cca76cSShri Abhyankar     v   = aa + 25*ai[i];
325853cca76cSShri Abhyankar     vi  = aj + ai[i];
325953cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
326053cca76cSShri Abhyankar     idx = 5*i;
326153cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
326253cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
326353cca76cSShri Abhyankar       jdx   = 5*vi[k];
326453cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
326553cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
326653cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
326753cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
326853cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
326953cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
327053cca76cSShri Abhyankar       v    += 25;
327153cca76cSShri Abhyankar     }
327253cca76cSShri Abhyankar     x[idx]   = s1;
327353cca76cSShri Abhyankar     x[1+idx] = s2;
327453cca76cSShri Abhyankar     x[2+idx] = s3;
327553cca76cSShri Abhyankar     x[3+idx] = s4;
327653cca76cSShri Abhyankar     x[4+idx] = s5;
327753cca76cSShri Abhyankar   }
327853cca76cSShri Abhyankar 
327953cca76cSShri Abhyankar   /* backward solve the upper triangular */
328053cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
328153cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
328253cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
328353cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
328453cca76cSShri Abhyankar     idt = 5*i;
328553cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
328653cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
328753cca76cSShri Abhyankar     for(k=0;k<nz;k++){
328853cca76cSShri Abhyankar       idx   = 5*vi[k];
328953cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
329053cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
329153cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
329253cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
329353cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
329453cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
329553cca76cSShri Abhyankar       v    += 25;
329653cca76cSShri Abhyankar     }
329753cca76cSShri Abhyankar     /* x = inv_diagonal*x */
329853cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
329953cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
330053cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
330153cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
330253cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
330353cca76cSShri Abhyankar   }
330453cca76cSShri Abhyankar 
330553cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
330653cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
330753cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
330853cca76cSShri Abhyankar   PetscFunctionReturn(0);
330953cca76cSShri Abhyankar }
331053cca76cSShri Abhyankar 
331153cca76cSShri Abhyankar #undef __FUNCT__
3312*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
3313*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
33144e2b4712SSatish Balay {
33154e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
33164e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
33176849ba73SBarry Smith   PetscErrorCode    ierr;
33185d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
33195d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3320d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3321d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3322d9fead3dSBarry Smith   const PetscScalar *b;
33234e2b4712SSatish Balay 
33244e2b4712SSatish Balay   PetscFunctionBegin;
3325d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33261ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3327f1af5d2fSBarry Smith   t  = a->solve_work;
33284e2b4712SSatish Balay 
33294e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
33304e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
33314e2b4712SSatish Balay 
33324e2b4712SSatish Balay   /* forward solve the lower triangular */
33334e2b4712SSatish Balay   idx    = 4*(*r++);
3334f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3335f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
33364e2b4712SSatish Balay   for (i=1; i<n; i++) {
33374e2b4712SSatish Balay     v     = aa + 16*ai[i];
33384e2b4712SSatish Balay     vi    = aj + ai[i];
33394e2b4712SSatish Balay     nz    = diag[i] - ai[i];
33404e2b4712SSatish Balay     idx   = 4*(*r++);
3341f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
33424e2b4712SSatish Balay     while (nz--) {
33434e2b4712SSatish Balay       idx   = 4*(*vi++);
3344f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3345f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3346f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3347f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3348f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
33494e2b4712SSatish Balay       v    += 16;
33504e2b4712SSatish Balay     }
33514e2b4712SSatish Balay     idx        = 4*i;
3352f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3353f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
33544e2b4712SSatish Balay   }
33554e2b4712SSatish Balay   /* backward solve the upper triangular */
33564e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
33574e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
33584e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
33594e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
33604e2b4712SSatish Balay     idt  = 4*i;
3361f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3362f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
33634e2b4712SSatish Balay     while (nz--) {
33644e2b4712SSatish Balay       idx   = 4*(*vi++);
3365f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3366f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3367f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3368f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3369f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3370f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
33714e2b4712SSatish Balay       v += 16;
33724e2b4712SSatish Balay     }
33734e2b4712SSatish Balay     idc      = 4*(*c--);
33744e2b4712SSatish Balay     v        = aa + 16*diag[i];
3375f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3376f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3377f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3378f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
33794e2b4712SSatish Balay   }
33804e2b4712SSatish Balay 
33814e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
33824e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3383d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33841ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3385dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
33864e2b4712SSatish Balay   PetscFunctionReturn(0);
33874e2b4712SSatish Balay }
3388f26ec98cSKris Buschelman 
33898f690400SShri Abhyankar #undef __FUNCT__
3390a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_newdatastruct"
3391a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_newdatastruct(Mat A,Vec bb,Vec xx)
339278bb4007SShri Abhyankar {
339378bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
339478bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
339578bb4007SShri Abhyankar   PetscErrorCode    ierr;
339678bb4007SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
339778bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
339878bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
339978bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
340078bb4007SShri Abhyankar   const PetscScalar *b;
340178bb4007SShri Abhyankar 
340278bb4007SShri Abhyankar   PetscFunctionBegin;
340378bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
340478bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
340578bb4007SShri Abhyankar   t  = a->solve_work;
340678bb4007SShri Abhyankar 
340778bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
340878bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
340978bb4007SShri Abhyankar 
341078bb4007SShri Abhyankar   /* forward solve the lower triangular */
341178bb4007SShri Abhyankar   idx    = 4*r[0];
341278bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
341378bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
341478bb4007SShri Abhyankar   for (i=1; i<n; i++) {
341578bb4007SShri Abhyankar     v     = aa + 16*ai[i];
341678bb4007SShri Abhyankar     vi    = aj + ai[i];
341778bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
341878bb4007SShri Abhyankar     idx   = 4*r[i];
341978bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
342078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
342178bb4007SShri Abhyankar       idx   = 4*vi[m];
342278bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
342378bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
342478bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
342578bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
342678bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
342778bb4007SShri Abhyankar       v    += 16;
342878bb4007SShri Abhyankar     }
342978bb4007SShri Abhyankar     idx        = 4*i;
343078bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
343178bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
343278bb4007SShri Abhyankar   }
343378bb4007SShri Abhyankar   /* backward solve the upper triangular */
343478bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
343578bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
343678bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
343778bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
343878bb4007SShri Abhyankar     idt  = 4*i;
343978bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
344078bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
344178bb4007SShri Abhyankar     for(m=0;m<nz;m++){
344278bb4007SShri Abhyankar       idx   = 4*vi[m];
344378bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
344478bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
344578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
344678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
344778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
344878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
344978bb4007SShri Abhyankar       v += 16;
345078bb4007SShri Abhyankar     }
345178bb4007SShri Abhyankar     idc      = 4*c[i];
345278bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
345378bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
345478bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
345578bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
345678bb4007SShri Abhyankar   }
345778bb4007SShri Abhyankar 
345878bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
345978bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
346078bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
346178bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
346278bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
346378bb4007SShri Abhyankar   PetscFunctionReturn(0);
346478bb4007SShri Abhyankar }
346578bb4007SShri Abhyankar 
346678bb4007SShri Abhyankar #undef __FUNCT__
3467f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3468dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3469f26ec98cSKris Buschelman {
3470f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3471f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
34726849ba73SBarry Smith   PetscErrorCode    ierr;
34735d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
34745d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3475d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3476d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3477d9fead3dSBarry Smith   PetscScalar       *x;
3478d9fead3dSBarry Smith   const PetscScalar *b;
3479f26ec98cSKris Buschelman 
3480f26ec98cSKris Buschelman   PetscFunctionBegin;
3481d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3483f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3484f26ec98cSKris Buschelman 
3485f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3486f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3487f26ec98cSKris Buschelman 
3488f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3489f26ec98cSKris Buschelman   idx    = 4*(*r++);
3490f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3491f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3492f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3493f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3494f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3495f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3496f26ec98cSKris Buschelman     vi    = aj + ai[i];
3497f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3498f26ec98cSKris Buschelman     idx   = 4*(*r++);
3499f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3500f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3501f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3502f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3503f26ec98cSKris Buschelman     while (nz--) {
3504f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3505f26ec98cSKris Buschelman       x1  = t[idx];
3506f26ec98cSKris Buschelman       x2  = t[1+idx];
3507f26ec98cSKris Buschelman       x3  = t[2+idx];
3508f26ec98cSKris Buschelman       x4  = t[3+idx];
3509f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3510f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3511f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3512f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3513f26ec98cSKris Buschelman       v    += 16;
3514f26ec98cSKris Buschelman     }
3515f26ec98cSKris Buschelman     idx        = 4*i;
3516f26ec98cSKris Buschelman     t[idx]   = s1;
3517f26ec98cSKris Buschelman     t[1+idx] = s2;
3518f26ec98cSKris Buschelman     t[2+idx] = s3;
3519f26ec98cSKris Buschelman     t[3+idx] = s4;
3520f26ec98cSKris Buschelman   }
3521f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3522f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3523f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3524f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3525f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3526f26ec98cSKris Buschelman     idt  = 4*i;
3527f26ec98cSKris Buschelman     s1 = t[idt];
3528f26ec98cSKris Buschelman     s2 = t[1+idt];
3529f26ec98cSKris Buschelman     s3 = t[2+idt];
3530f26ec98cSKris Buschelman     s4 = t[3+idt];
3531f26ec98cSKris Buschelman     while (nz--) {
3532f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3533f26ec98cSKris Buschelman       x1  = t[idx];
3534f26ec98cSKris Buschelman       x2  = t[1+idx];
3535f26ec98cSKris Buschelman       x3  = t[2+idx];
3536f26ec98cSKris Buschelman       x4  = t[3+idx];
3537f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3538f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3539f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3540f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3541f26ec98cSKris Buschelman       v += 16;
3542f26ec98cSKris Buschelman     }
3543f26ec98cSKris Buschelman     idc      = 4*(*c--);
3544f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3545f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3546f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3547f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3548f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3549f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3550f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3551f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3552f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3553f26ec98cSKris Buschelman  }
3554f26ec98cSKris Buschelman 
3555f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3556f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3557d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3559dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3560f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3561f26ec98cSKris Buschelman }
3562f26ec98cSKris Buschelman 
356324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
356424c233c2SKris Buschelman 
356524c233c2SKris Buschelman #include PETSC_HAVE_SSE
356624c233c2SKris Buschelman 
356724c233c2SKris Buschelman #undef __FUNCT__
356824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_SSE_Demotion - forward and backward block-triangular
   sweeps over the 4x4 blocks stored in a->a, using SSE single-precision
   arithmetic.

   Input:
     A  - matrix whose data (Mat_SeqBAIJ) supplies a->i, a->j, a->diag, a->a,
          the index sets a->row / a->col and the work array a->solve_work
     bb - right-hand side, read through the row index set
   Output:
     xx - result, written through the column index set (walked backwards)

   Returns 0 through PetscFunctionReturn; every PETSc call is followed by
   CHKERRQ(ierr).

   Each 4-vector is converted to float in the aligned stack scratch
   (tmps / tmpx) before the SSE block products and converted back afterwards.
   XMM7 carries the running 4-vector for the current block row across the
   inline SSE sections.
*/
3569dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
357024c233c2SKris Buschelman {
357124c233c2SKris Buschelman   /*
357224c233c2SKris Buschelman      Note: This code uses demotion of double
357324c233c2SKris Buschelman      to float when performing the mixed-mode computation.
357424c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
357524c233c2SKris Buschelman   */
357624c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
357724c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
35786849ba73SBarry Smith   PetscErrorCode ierr;
35795d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
35805d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
358124c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
358287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
358324c233c2SKris Buschelman 
358424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
358524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
358624c233c2SKris Buschelman   unsigned long   offset;
358724c233c2SKris Buschelman 
358824c233c2SKris Buschelman   PetscFunctionBegin;
358924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
359024c233c2SKris Buschelman 
    /*
       Skip 0-3 leading floats so that tmps starts on a 16-byte boundary
       (assumes a 4-byte float). tmps and tmpx are then two consecutive
       groups of 4 floats, reaching at most index 10 of the 11 reserved.
    */
359124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
359224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
359324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
359424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
359524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
359624c233c2SKris Buschelman 
35971ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
35981ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
359924c233c2SKris Buschelman     t  = a->solve_work;
360024c233c2SKris Buschelman 
    /* r walks the row indices forwards; c starts at the last column index and walks backwards */
360124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
360224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
360324c233c2SKris Buschelman 
360424c233c2SKris Buschelman     /* forward solve the lower triangular */
    /* Block row 0 is copied from b (through the first row index) into t without any arithmetic */
360524c233c2SKris Buschelman     idx  = 4*(*r++);
360624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
360724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
360824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
360924c233c2SKris Buschelman 
    /* i is advanced inside the body (ai[++i] below), not in the for header */
361024c233c2SKris Buschelman     for (i=1; i<n;) {
361124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
361224c233c2SKris Buschelman       vi   =  aj      + ai[i];
361324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
361424c233c2SKris Buschelman       idx  =  4*(*r++);
361524c233c2SKris Buschelman 
361624c233c2SKris Buschelman       /* Demote sum from double to float */
361724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
361824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
361924c233c2SKris Buschelman 
362024c233c2SKris Buschelman       while (nz--) {
362124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
362224c233c2SKris Buschelman         idx = 4*(*vi++);
362324c233c2SKris Buschelman 
362424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
        /*
           NOTE(review): this reads x, but the rows finished so far are stored
           into t (CONVERT_FLOAT4_DOUBLE4 at the end of this loop body), and the
           backward sweep reads t at the matching place. Confirm x is intended.
        */
362524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
362624c233c2SKris Buschelman 
362724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
362824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
362924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
363024c233c2SKris Buschelman 
363124c233c2SKris Buschelman           /* First Column */
363224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
363324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
363424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
363524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
363624c233c2SKris Buschelman 
363724c233c2SKris Buschelman           /* Second Column */
363824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
363924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
364024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
364124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
364224c233c2SKris Buschelman 
364324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
364424c233c2SKris Buschelman 
364524c233c2SKris Buschelman           /* Third Column */
364624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
364724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
364824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
364924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
365024c233c2SKris Buschelman 
365124c233c2SKris Buschelman           /* Fourth Column */
365224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
365324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
365424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
365524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
365624c233c2SKris Buschelman         SSE_INLINE_END_2
365724c233c2SKris Buschelman 
365824c233c2SKris Buschelman         v  += 16;
365924c233c2SKris Buschelman       }
      /* idx is taken from the current i before ++i moves on to the next block row */
366024c233c2SKris Buschelman       idx = 4*i;
366124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
366224c233c2SKris Buschelman       PREFETCH_NTA(v);
366324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
366424c233c2SKris Buschelman 
366524c233c2SKris Buschelman       /* Promote result from float to double */
366624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
366724c233c2SKris Buschelman     }
366824c233c2SKris Buschelman     /* backward solve the upper triangular */
366924c233c2SKris Buschelman     idt  = 4*(n-1);
367024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
367124c233c2SKris Buschelman     v    = aa + ai16 + 16;
    /* i is decremented inside the body (diag[--i] below), not in the for header */
367224c233c2SKris Buschelman     for (i=n-1; i>=0;){
367324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
367424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
367524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
367624c233c2SKris Buschelman 
367724c233c2SKris Buschelman       /* Demote accumulator from double to float */
367824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
367924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
368024c233c2SKris Buschelman 
368124c233c2SKris Buschelman       while (nz--) {
368224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
368324c233c2SKris Buschelman         idx = 4*(*vi++);
368424c233c2SKris Buschelman 
368524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
368624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
368724c233c2SKris Buschelman 
368824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
368924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
369024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
369124c233c2SKris Buschelman 
369224c233c2SKris Buschelman           /* First Column */
369324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
369424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
369524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
369624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
369724c233c2SKris Buschelman 
369824c233c2SKris Buschelman           /* Second Column */
369924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
370024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
370124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
370224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
370324c233c2SKris Buschelman 
370424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
370524c233c2SKris Buschelman 
370624c233c2SKris Buschelman           /* Third Column */
370724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
370824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
370924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
371024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
371124c233c2SKris Buschelman 
371224c233c2SKris Buschelman           /* Fourth Column */
371324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
371424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
371524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
371624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
371724c233c2SKris Buschelman         SSE_INLINE_END_2
371824c233c2SKris Buschelman         v  += 16;
371924c233c2SKris Buschelman       }
      /*
         v now points at the diagonal block of the current row; ai16 is then
         moved on to the next (lower-numbered) row for prefetching.
         NOTE(review): on the pass that starts with i == 0 the pre-decrement
         makes this diag[-1]; that value feeds the prefetches and the final
         v assignment of this pass. Confirm that read is acceptable.
      */
372024c233c2SKris Buschelman       v    = aa + ai16;
372124c233c2SKris Buschelman       ai16 = 16*diag[--i];
372224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
372324c233c2SKris Buschelman       /*
372424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
372524c233c2SKris Buschelman          which was inverted as part of the factorization
372624c233c2SKris Buschelman       */
372724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
372824c233c2SKris Buschelman         /* First Column */
372924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
373024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
373124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
373224c233c2SKris Buschelman 
373324c233c2SKris Buschelman         /* Second Column */
373424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
373524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
373624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
373724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
373824c233c2SKris Buschelman 
373924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
374024c233c2SKris Buschelman 
374124c233c2SKris Buschelman         /* Third Column */
374224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
374324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
374424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
374524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
374624c233c2SKris Buschelman 
374724c233c2SKris Buschelman         /* Fourth Column */
374824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
374924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
375024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
375124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
375224c233c2SKris Buschelman 
375324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
375424c233c2SKris Buschelman       SSE_INLINE_END_3
375524c233c2SKris Buschelman 
375624c233c2SKris Buschelman       /* Promote solution from float to double */
375724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
375824c233c2SKris Buschelman 
375924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
376024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
376124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
376224c233c2SKris Buschelman       idc  = 4*(*c--);
376324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
376424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idt]; */
376524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
376624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
376724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idt]; */
376824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
376924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
377024c233c2SKris Buschelman       SSE_INLINE_END_2
377124c233c2SKris Buschelman       v    = aa + ai16 + 16;
377224c233c2SKris Buschelman       idt -= 4;
377324c233c2SKris Buschelman     }
377424c233c2SKris Buschelman 
377524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
377624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
37771ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
37781ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3779dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
378024c233c2SKris Buschelman   SSE_SCOPE_END;
378124c233c2SKris Buschelman   PetscFunctionReturn(0);
378224c233c2SKris Buschelman }
378324c233c2SKris Buschelman 
378424c233c2SKris Buschelman #endif
37850ef38995SBarry Smith 
37860ef38995SBarry Smith 
37874e2b4712SSatish Balay /*
37884e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
37894e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
37904e2b4712SSatish Balay */
37914a2ae208SSatish Balay #undef __FUNCT__
3792*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_inplace - forward and backward
   block-triangular sweeps over the 4x4 blocks stored in a->a, with no row or
   column index sets applied (block row i of b maps to block row i of x).

   Input:
     A  - matrix whose data (Mat_SeqBAIJ) supplies a->mbs, a->i, a->j,
          a->diag and a->a
     bb - right-hand side (only read here)
   Output:
     xx - result; the sweeps work directly in its array

   Returns 0 through PetscFunctionReturn; every PETSc call is followed by
   CHKERRQ(ierr).

   One of three Fortran kernels is called instead of the C loops when the
   matching PETSC_USE_FORTRAN_KERNEL_* macro is defined.
   NOTE(review): two of those branches pass a fixed 2000-entry static work
   array w; the size the kernels require is not visible here.
*/
3793*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
37944e2b4712SSatish Balay {
37954e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3796356650c2SBarry Smith   PetscInt          n=a->mbs;
3797356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
3798dfbe8321SBarry Smith   PetscErrorCode    ierr;
3799356650c2SBarry Smith   const PetscInt    *diag = a->diag;
3800d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
3801d9fead3dSBarry Smith   PetscScalar       *x;
3802d9fead3dSBarry Smith   const PetscScalar *b;
38034e2b4712SSatish Balay 
38044e2b4712SSatish Balay   PetscFunctionBegin;
  /* b is declared const; the cast only adapts it to the VecGetArray argument type */
3805d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38061ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
38074e2b4712SSatish Balay 
3808aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
38092853dc0eSBarry Smith   {
381087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
38112853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
38122853dc0eSBarry Smith   }
3813aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
38142853dc0eSBarry Smith   {
381587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
38162853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
38172853dc0eSBarry Smith   }
3818aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
38192853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
3820e1293385SBarry Smith #else
382130d4dcafSBarry Smith   {
382287828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
3823d9fead3dSBarry Smith     const MatScalar *v;
3824356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
3825356650c2SBarry Smith     const PetscInt  *vi;
3826e1293385SBarry Smith 
38274e2b4712SSatish Balay   /* forward solve the lower triangular */
  /*
     Block row 0 is copied from b unchanged.
     NOTE(review): this copy runs before any check on n, so b[0..3] and
     x[0..3] are touched even if n were 0; confirm n >= 1 is guaranteed.
  */
38284e2b4712SSatish Balay   idx    = 0;
3829e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
38304e2b4712SSatish Balay   for (i=1; i<n; i++) {
    /* entries of row i from ai[i] up to (not including) diag[i] */
38314e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
38324e2b4712SSatish Balay     vi    =  aj      + ai[i];
38334e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
3834e1293385SBarry Smith     idx   +=  4;
3835f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
38364e2b4712SSatish Balay     while (nz--) {
38374e2b4712SSatish Balay       jdx   = 4*(*vi++);
38384e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
      /* s -= block * x, with v[r+4*c] used as entry (r,c) of the 4x4 block */
3839f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3840f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3841f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3842f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
38434e2b4712SSatish Balay       v    += 16;
38444e2b4712SSatish Balay     }
3845f1af5d2fSBarry Smith     x[idx]   = s1;
3846f1af5d2fSBarry Smith     x[1+idx] = s2;
3847f1af5d2fSBarry Smith     x[2+idx] = s3;
3848f1af5d2fSBarry Smith     x[3+idx] = s4;
38494e2b4712SSatish Balay   }
38504e2b4712SSatish Balay   /* backward solve the upper triangular */
38514e555682SBarry Smith   idt = 4*(n-1);
38524e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
    /* entries of row i after diag[i], up to the end of the row at ai[i+1] */
38534e555682SBarry Smith     ai16 = 16*diag[i];
38544e555682SBarry Smith     v    = aa + ai16 + 16;
38554e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
38564e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
3857f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3858f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
38594e2b4712SSatish Balay     while (nz--) {
38604e2b4712SSatish Balay       idx   = 4*(*vi++);
38614e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
3862f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3863f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3864f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3865f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
38664e2b4712SSatish Balay       v    += 16;
38674e2b4712SSatish Balay     }
    /*
       Multiply by the block stored at diag[i]; presumably that block already
       holds the inverse of the diagonal block - confirm against the factorization.
    */
38684e555682SBarry Smith     v        = aa + ai16;
3869f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
3870f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
3871f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3872f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3873329f5518SBarry Smith     idt -= 4;
38744e2b4712SSatish Balay   }
387530d4dcafSBarry Smith   }
3876e1293385SBarry Smith #endif
38774e2b4712SSatish Balay 
3878d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38791ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3880dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
38814e2b4712SSatish Balay   PetscFunctionReturn(0);
38824e2b4712SSatish Balay }
38834e2b4712SSatish Balay 
3884b2b2dd24SShri Abhyankar #undef __FUNCT__
3885a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct - forward and backward
   block-triangular sweeps with no index sets applied, for the storage layout
   in which row i of the lower part spans ai[i] .. ai[i+1]-1 and row i of the
   upper part is addressed through adiag (adiag[i+1]+1 .. adiag[i], with the
   diagonal block last).

   Input:
     A  - matrix whose data (Mat_SeqBAIJ) supplies a->mbs, a->i, a->j,
          a->diag, a->a and a->bs2; A->rmap->bs gives the block size
     bb - right-hand side (only read here)
   Output:
     xx - result; the sweeps work directly in its array

   Returns 0 through PetscFunctionReturn; every PETSc call is followed by
   CHKERRQ(ierr).

   NOTE(review): strides come from bs and bs2 at run time, but the arithmetic
   is written out for four components and v[0..15]; this presumably requires
   bs == 4 and bs2 == 16 - confirm at the call site.
*/
3886a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
3887b2b2dd24SShri Abhyankar {
3888b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3889b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
3890b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
3891b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
3892b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
3893b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
3894b2b2dd24SShri Abhyankar     PetscScalar       *x;
3895b2b2dd24SShri Abhyankar     const PetscScalar *b;
3896b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,s4,x1,x2,x3,x4;
3897cee9d6f2SShri Abhyankar 
3898b2b2dd24SShri Abhyankar     PetscFunctionBegin;
3899b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3900b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3901b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
    /* block row 0 is copied from b unchanged */
3902b2b2dd24SShri Abhyankar     idx    = 0;
3903b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
3904b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
3905b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
3906b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
3907b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
3908b2b2dd24SShri Abhyankar       idx   = bs*i;
3909b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3910b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
3911b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
3912b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
          /* s -= block * x, with v[r+4*c] used as entry (r,c) of the block */
3913b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3914b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3915b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3916b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3917b2b2dd24SShri Abhyankar 
3918b2b2dd24SShri Abhyankar           v   +=  bs2;
3919b2b2dd24SShri Abhyankar         }
3920b2b2dd24SShri Abhyankar 
3921b2b2dd24SShri Abhyankar        x[idx]   = s1;
3922b2b2dd24SShri Abhyankar        x[1+idx] = s2;
3923b2b2dd24SShri Abhyankar        x[2+idx] = s3;
3924b2b2dd24SShri Abhyankar        x[3+idx] = s4;
3925b2b2dd24SShri Abhyankar     }
3926b2b2dd24SShri Abhyankar 
3927b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
3928b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* off-diagonal blocks of row i start right after adiag[i+1]; there are adiag[i]-adiag[i+1]-1 of them */
3929b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
3930b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
3931b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
3932b2b2dd24SShri Abhyankar      idt = bs*i;
3933b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
3934b2b2dd24SShri Abhyankar 
3935b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
3936b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
3937b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
3938b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
3939b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
3940b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3941b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3942b2b2dd24SShri Abhyankar 
3943b2b2dd24SShri Abhyankar         v   +=  bs2;
3944b2b2dd24SShri Abhyankar     }
3945b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
    /* after nz steps v has advanced to aa + bs2*adiag[i], the block used for this scaling */
3946b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
3947b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
3948b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
3949b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
3950b2b2dd24SShri Abhyankar 
3951b2b2dd24SShri Abhyankar   }
3952b2b2dd24SShri Abhyankar 
3953b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3954b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3955b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
3956b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
3957b2b2dd24SShri Abhyankar }
3958cee9d6f2SShri Abhyankar 
3959cee9d6f2SShri Abhyankar #undef __FUNCT__
3960f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion - forward and backward
   block-triangular sweeps over 4x4 blocks with no index sets applied, doing
   the arithmetic in MatScalar and storing the final result as PetscScalar.

   Input:
     A  - matrix whose data (Mat_SeqBAIJ) supplies a->mbs, a->i, a->j,
          a->diag and a->a
     bb - right-hand side
   Output:
     xx - result

   Returns 0 through PetscFunctionReturn; every PETSc call is followed by
   CHKERRQ(ierr).

   The forward sweep stores its MatScalar intermediates in t, which is the
   array of xx viewed as MatScalar; no separate work array is used. The
   backward sweep then overwrites the same storage with PetscScalar values.
*/
3961dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
3962f26ec98cSKris Buschelman {
3963f26ec98cSKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
3964690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
3965dfbe8321SBarry Smith   PetscErrorCode ierr;
3966690b6cddSBarry Smith   PetscInt       *diag = a->diag;
3967f26ec98cSKris Buschelman   MatScalar      *aa=a->a;
3968f26ec98cSKris Buschelman   PetscScalar    *x,*b;
3969f26ec98cSKris Buschelman 
3970f26ec98cSKris Buschelman   PetscFunctionBegin;
39711ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
39721ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3973f26ec98cSKris Buschelman 
3974f26ec98cSKris Buschelman   {
3975f26ec98cSKris Buschelman     MatScalar  s1,s2,s3,s4,x1,x2,x3,x4;
    /* t aliases the storage of x, reinterpreted as MatScalar */
3976f26ec98cSKris Buschelman     MatScalar  *v,*t=(MatScalar *)x;
3977690b6cddSBarry Smith     PetscInt   jdx,idt,idx,nz,*vi,i,ai16;
3978f26ec98cSKris Buschelman 
3979f26ec98cSKris Buschelman     /* forward solve the lower triangular */
    /* block row 0 is b converted to MatScalar, with no arithmetic */
3980f26ec98cSKris Buschelman     idx  = 0;
3981f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
3982f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
3983f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
3984f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
3985f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
      /* entries of row i from ai[i] up to (not including) diag[i] */
3986f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
3987f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
3988f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
3989f26ec98cSKris Buschelman       idx   +=  4;
3990f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
3991f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
3992f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
3993f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
3994f26ec98cSKris Buschelman       while (nz--) {
3995f26ec98cSKris Buschelman         jdx = 4*(*vi++);
3996f26ec98cSKris Buschelman         x1  = t[jdx];
3997f26ec98cSKris Buschelman         x2  = t[1+jdx];
3998f26ec98cSKris Buschelman         x3  = t[2+jdx];
3999f26ec98cSKris Buschelman         x4  = t[3+jdx];
        /* s -= block * x, with v[r+4*c] used as entry (r,c) of the 4x4 block */
4000f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4001f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4002f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4003f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4004f26ec98cSKris Buschelman         v    += 16;
4005f26ec98cSKris Buschelman       }
4006f26ec98cSKris Buschelman       t[idx]   = s1;
4007f26ec98cSKris Buschelman       t[1+idx] = s2;
4008f26ec98cSKris Buschelman       t[2+idx] = s3;
4009f26ec98cSKris Buschelman       t[3+idx] = s4;
4010f26ec98cSKris Buschelman     }
4011f26ec98cSKris Buschelman     /* backward solve the upper triangular */
    /*
       The row being finished is read from t (MatScalar view), while rows
       already finished are read back from x as PetscScalar, and the result
       for this row is stored into x as PetscScalar.
       NOTE(review): because t and x share storage, this relies on the
       descending row order and on the relative sizes of MatScalar and
       PetscScalar so that a store into x never covers a t entry that is
       still to be read - confirm before reordering anything here.
    */
4012f26ec98cSKris Buschelman     idt = 4*(n-1);
4013f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
4014f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4015f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4016f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4017f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4018f26ec98cSKris Buschelman       s1   = t[idt];
4019f26ec98cSKris Buschelman       s2   = t[1+idt];
4020f26ec98cSKris Buschelman       s3   = t[2+idt];
4021f26ec98cSKris Buschelman       s4   = t[3+idt];
4022f26ec98cSKris Buschelman       while (nz--) {
4023f26ec98cSKris Buschelman         idx = 4*(*vi++);
4024f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4025f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4026f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4027f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4028f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4029f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4030f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4031f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4032f26ec98cSKris Buschelman         v    += 16;
4033f26ec98cSKris Buschelman       }
      /*
         Multiply by the block stored at diag[i]; presumably that block already
         holds the inverse of the diagonal block - confirm against the factorization.
      */
4034f26ec98cSKris Buschelman       v        = aa + ai16;
4035f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4036f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4037f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4038f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4039f26ec98cSKris Buschelman       idt -= 4;
4040f26ec98cSKris Buschelman     }
4041f26ec98cSKris Buschelman   }
4042f26ec98cSKris Buschelman 
40431ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40441ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4045dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4046f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4047f26ec98cSKris Buschelman }
4048f26ec98cSKris Buschelman 
40493660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
40503660e330SKris Buschelman 
40513660e330SKris Buschelman #include PETSC_HAVE_SSE
40523660e330SKris Buschelman #undef __FUNCT__
40537cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4054dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
40553660e330SKris Buschelman {
40563660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
40572aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4058dfbe8321SBarry Smith   PetscErrorCode ierr;
4059dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
40603660e330SKris Buschelman   MatScalar      *aa=a->a;
406187828ca2SBarry Smith   PetscScalar    *x,*b;
40623660e330SKris Buschelman 
40633660e330SKris Buschelman   PetscFunctionBegin;
40643660e330SKris Buschelman   SSE_SCOPE_BEGIN;
40653660e330SKris Buschelman   /*
40663660e330SKris Buschelman      Note: This code currently uses demotion of double
40673660e330SKris Buschelman      to float when performing the mixed-mode computation.
40683660e330SKris Buschelman      This may not be numerically reasonable for all applications.
40693660e330SKris Buschelman   */
40703660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
40713660e330SKris Buschelman 
40721ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
40731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
40743660e330SKris Buschelman   {
4075eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4076eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
40772aa5897fSKris Buschelman     int            nz,i,idt,ai16;
40782aa5897fSKris Buschelman     unsigned int   jdx,idx;
40792aa5897fSKris Buschelman     unsigned short *vi;
4080eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
40813660e330SKris Buschelman 
4082eb05f457SKris Buschelman     /* First block is the identity. */
40833660e330SKris Buschelman     idx  = 0;
4084eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
40852aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
40863660e330SKris Buschelman 
40873660e330SKris Buschelman     for (i=1; i<n;) {
40883660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
40893660e330SKris Buschelman       vi   =  aj      + ai[i];
40903660e330SKris Buschelman       nz   =  diag[i] - ai[i];
40913660e330SKris Buschelman       idx +=  4;
40923660e330SKris Buschelman 
4093eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4094eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4095eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
40963660e330SKris Buschelman 
40973660e330SKris Buschelman       while (nz--) {
40983660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
40992aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
41003660e330SKris Buschelman 
41013660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4102eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
41033660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
41043660e330SKris Buschelman 
41053660e330SKris Buschelman           /* First Column */
41063660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
41073660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
41083660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
41093660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
41103660e330SKris Buschelman 
41113660e330SKris Buschelman           /* Second Column */
41123660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
41133660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
41143660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
41153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
41163660e330SKris Buschelman 
41173660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
41183660e330SKris Buschelman 
41193660e330SKris Buschelman           /* Third Column */
41203660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
41213660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
41223660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
41233660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
41243660e330SKris Buschelman 
41253660e330SKris Buschelman           /* Fourth Column */
41263660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
41273660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
41283660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
41293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
41303660e330SKris Buschelman         SSE_INLINE_END_2
41313660e330SKris Buschelman 
41323660e330SKris Buschelman         v  += 16;
41333660e330SKris Buschelman       }
41343660e330SKris Buschelman       v    =  aa + 16*ai[++i];
41353660e330SKris Buschelman       PREFETCH_NTA(v);
4136eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
41373660e330SKris Buschelman     }
4138eb05f457SKris Buschelman 
4139eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4140eb05f457SKris Buschelman 
41413660e330SKris Buschelman     idt  = 4*(n-1);
41423660e330SKris Buschelman     ai16 = 16*diag[n-1];
41433660e330SKris Buschelman     v    = aa + ai16 + 16;
41443660e330SKris Buschelman     for (i=n-1; i>=0;){
41453660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
41463660e330SKris Buschelman       vi = aj + diag[i] + 1;
41473660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
41483660e330SKris Buschelman 
4149eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
41503660e330SKris Buschelman 
41513660e330SKris Buschelman       while (nz--) {
41523660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
41532aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
41543660e330SKris Buschelman 
41553660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4156eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
41573660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
41583660e330SKris Buschelman 
41593660e330SKris Buschelman           /* First Column */
41603660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
41613660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
41623660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
41633660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
41643660e330SKris Buschelman 
41653660e330SKris Buschelman           /* Second Column */
41663660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
41673660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
41683660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
41693660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
41703660e330SKris Buschelman 
41713660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
41723660e330SKris Buschelman 
41733660e330SKris Buschelman           /* Third Column */
41743660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
41753660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
41763660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
41773660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
41783660e330SKris Buschelman 
41793660e330SKris Buschelman           /* Fourth Column */
41803660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
41813660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
41823660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
41833660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
41843660e330SKris Buschelman         SSE_INLINE_END_2
41853660e330SKris Buschelman         v  += 16;
41863660e330SKris Buschelman       }
41873660e330SKris Buschelman       v    = aa + ai16;
41883660e330SKris Buschelman       ai16 = 16*diag[--i];
41893660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
41903660e330SKris Buschelman       /*
41913660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
41923660e330SKris Buschelman          which was inverted as part of the factorization
41933660e330SKris Buschelman       */
4194eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
41953660e330SKris Buschelman         /* First Column */
41963660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
41973660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
41983660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
41993660e330SKris Buschelman 
42003660e330SKris Buschelman         /* Second Column */
42013660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
42023660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
42033660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
42043660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
42053660e330SKris Buschelman 
42063660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
42073660e330SKris Buschelman 
42083660e330SKris Buschelman         /* Third Column */
42093660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
42103660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
42113660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
42123660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
42133660e330SKris Buschelman 
42143660e330SKris Buschelman         /* Fourth Column */
42153660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
42163660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
42173660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
42183660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
42193660e330SKris Buschelman 
42203660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
42213660e330SKris Buschelman       SSE_INLINE_END_3
42223660e330SKris Buschelman 
42233660e330SKris Buschelman       v    = aa + ai16 + 16;
42243660e330SKris Buschelman       idt -= 4;
42253660e330SKris Buschelman     }
4226eb05f457SKris Buschelman 
4227eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4228eb05f457SKris Buschelman     idt = 4*(n-1);
4229eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4230eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4231eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4232eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4233eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4234eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4235eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4236eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4237eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
423854693613SKris Buschelman       idt -= 4;
42393660e330SKris Buschelman     }
4240eb05f457SKris Buschelman 
4241eb05f457SKris Buschelman   } /* End of artificial scope. */
42421ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
42431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4244dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
42453660e330SKris Buschelman   SSE_SCOPE_END;
42463660e330SKris Buschelman   PetscFunctionReturn(0);
42473660e330SKris Buschelman }
42483660e330SKris Buschelman 
42497cf1b8d3SKris Buschelman #undef __FUNCT__
42507cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - forward and backward block
   triangular solve using the factor data held in A->data (Mat_SeqBAIJ), with
   the 4x4 block kernels written with the SSE_* macros.

   A  - matrix whose Mat_SeqBAIJ fields i, j, diag, a, mbs and nz are read here
   bb - right-hand side; its array is read as b
   xx - solution; its array x is used both as single precision scratch
        (through the alias t) and for the final promoted result

   Ends with PetscFunctionReturn(0); every PETSc call is checked with CHKERRQ.

   Each block occupies 16 consecutive MatScalar entries of a->a and each block
   row corresponds to 4 consecutive entries of b and x. No row or column
   permutation is applied anywhere in this routine.
*/
4251dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
42527cf1b8d3SKris Buschelman {
42537cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
  /* block column indices, kept at int width */
42547cf1b8d3SKris Buschelman   int            *aj=a->j;
4255dfbe8321SBarry Smith   PetscErrorCode ierr;
  /* ai: block row offsets, n: number of block rows, diag: position of each row's diagonal block */
4256dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
42577cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
42587cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
42597cf1b8d3SKris Buschelman 
42607cf1b8d3SKris Buschelman   PetscFunctionBegin;
42617cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
42627cf1b8d3SKris Buschelman   /*
42637cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
42647cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
42657cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
42667cf1b8d3SKris Buschelman   */
  /* Start fetching the first block of block row 1, the first one the forward loop touches. */
42677cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
42687cf1b8d3SKris Buschelman 
42691ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
42701ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
42717cf1b8d3SKris Buschelman   {
42727cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
    /* t is the same memory as x viewed as MatScalar; v walks the 16-entry blocks of aa */
42737cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
42747cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
42757cf1b8d3SKris Buschelman     int       jdx,idx;
42767cf1b8d3SKris Buschelman     int       *vi;
42777cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
42787cf1b8d3SKris Buschelman 
42797cf1b8d3SKris Buschelman     /* First block is the identity. */
42807cf1b8d3SKris Buschelman     idx  = 0;
42817cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
42827cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
42837cf1b8d3SKris Buschelman 
    /* The loop header has no increment: i is advanced by ai[++i] at the bottom of
       the body, where the next row's first block is also prefetched. */
42847cf1b8d3SKris Buschelman     for (i=1; i<n;) {
42857cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi/nz: column indices and count of the blocks left of the diagonal in row i */
42867cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
42877cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
42887cf1b8d3SKris Buschelman       idx +=  4;
42897cf1b8d3SKris Buschelman 
42907cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
42917cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 carries the running 4-entry result for this block row across the
         inner loop; it is written back by the STORE_PS after the loop. */
42927cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
42937cf1b8d3SKris Buschelman 
42947cf1b8d3SKris Buschelman       while (nz--) {
42957cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
        /* jdx: offset in t of the 4 already-computed entries for this block column */
42967cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
42977cf1b8d3SKris Buschelman /*          jdx = *vi++; */
42987cf1b8d3SKris Buschelman 
42997cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
43007cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
43017cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
43027cf1b8d3SKris Buschelman 
43037cf1b8d3SKris Buschelman           /* First Column */
43047cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
43057cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
43067cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
43077cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
43087cf1b8d3SKris Buschelman 
43097cf1b8d3SKris Buschelman           /* Second Column */
43107cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
43117cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
43127cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
43137cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
43147cf1b8d3SKris Buschelman 
43157cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
43167cf1b8d3SKris Buschelman 
43177cf1b8d3SKris Buschelman           /* Third Column */
43187cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
43197cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
43207cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
43217cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
43227cf1b8d3SKris Buschelman 
43237cf1b8d3SKris Buschelman           /* Fourth Column */
43247cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
43257cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
43267cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
43277cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
43287cf1b8d3SKris Buschelman         SSE_INLINE_END_2
43297cf1b8d3SKris Buschelman 
43307cf1b8d3SKris Buschelman         v  += 16;
43317cf1b8d3SKris Buschelman       }
      /* Advance to the next block row; when i reaches n this reads ai[n]. */
43327cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
43337cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
      /* idx still refers to the row just finished (only i was advanced above) */
43347cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
43357cf1b8d3SKris Buschelman     }
43367cf1b8d3SKris Buschelman 
43377cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
43387cf1b8d3SKris Buschelman 
    /* idt: offset in t of block row i; ai16: 16*diag[i], i.e. the offset in aa of
       row i's diagonal block; v starts at the block right after that diagonal. */
43397cf1b8d3SKris Buschelman     idt  = 4*(n-1);
43407cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
43417cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
    /* As in the forward loop, i is updated inside the body (diag[--i]). */
43427cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
43437cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
      /* vi/nz: column indices and count of the blocks right of the diagonal in row i */
43447cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
43457cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
43467cf1b8d3SKris Buschelman 
      /* XMM7 again accumulates the row result; it is consumed by the diagonal scaling below. */
43477cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
43487cf1b8d3SKris Buschelman 
43497cf1b8d3SKris Buschelman       while (nz--) {
43507cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
43517cf1b8d3SKris Buschelman         idx = 4*(*vi++);
43527cf1b8d3SKris Buschelman /*          idx = *vi++; */
43537cf1b8d3SKris Buschelman 
43547cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
43557cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
43567cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
43577cf1b8d3SKris Buschelman 
43587cf1b8d3SKris Buschelman           /* First Column */
43597cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
43607cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
43617cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
43627cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
43637cf1b8d3SKris Buschelman 
43647cf1b8d3SKris Buschelman           /* Second Column */
43657cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
43667cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
43677cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
43687cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
43697cf1b8d3SKris Buschelman 
43707cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
43717cf1b8d3SKris Buschelman 
43727cf1b8d3SKris Buschelman           /* Third Column */
43737cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
43747cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
43757cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
43767cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
43777cf1b8d3SKris Buschelman 
43787cf1b8d3SKris Buschelman           /* Fourth Column */
43797cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
43807cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
43817cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
43827cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
43837cf1b8d3SKris Buschelman         SSE_INLINE_END_2
43847cf1b8d3SKris Buschelman         v  += 16;
43857cf1b8d3SKris Buschelman       }
      /* ai16 still belongs to the current row here, so v is this row's diagonal block. */
43867cf1b8d3SKris Buschelman       v    = aa + ai16;
      /* NOTE(review): on the last pass (i == 0 before the decrement) this evaluates
         diag[-1]. In this routine the resulting ai16 only feeds the two prefetch
         addresses below and a final value of v that is never dereferenced, but the
         read itself is outside the range indexed elsewhere in this function;
         confirm that diag[-1] is a readable location. */
43877cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
43887cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
43897cf1b8d3SKris Buschelman       /*
43907cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
43917cf1b8d3SKris Buschelman          which was inverted as part of the factorization
43927cf1b8d3SKris Buschelman       */
      /* ARG_1 = diagonal block, ARG_2 = destination in t (idt has not been
         decremented yet), ARG_3 = next row's diagonal block, used for prefetch only. */
43937cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
43947cf1b8d3SKris Buschelman         /* First Column */
43957cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
43967cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
43977cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
43987cf1b8d3SKris Buschelman 
43997cf1b8d3SKris Buschelman         /* Second Column */
44007cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
44017cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
44027cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
44037cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
44047cf1b8d3SKris Buschelman 
44057cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
44067cf1b8d3SKris Buschelman 
44077cf1b8d3SKris Buschelman         /* Third Column */
44087cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
44097cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
44107cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
44117cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
44127cf1b8d3SKris Buschelman 
44137cf1b8d3SKris Buschelman         /* Fourth Column */
44147cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
44157cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
44167cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
44177cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
44187cf1b8d3SKris Buschelman 
44197cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
44207cf1b8d3SKris Buschelman       SSE_INLINE_END_3
44217cf1b8d3SKris Buschelman 
      /* Position v on the first block after the diagonal of the row handled next. */
44227cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
44237cf1b8d3SKris Buschelman       idt -= 4;
44247cf1b8d3SKris Buschelman     }
44257cf1b8d3SKris Buschelman 
44267cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so the copy runs from the last entry to the first
       (blocks high to low, entries 3 down to 0). Presumably MatScalar is narrower
       than PetscScalar in this build, so that order keeps each write to x from
       landing on entries of t that have not been read yet -- TODO confirm. */
44277cf1b8d3SKris Buschelman     idt = 4*(n-1);
44287cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
44297cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
44307cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
44317cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
44327cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
44337cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
44347cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
44357cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
44367cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
44377cf1b8d3SKris Buschelman       idt -= 4;
44387cf1b8d3SKris Buschelman     }
44397cf1b8d3SKris Buschelman 
44407cf1b8d3SKris Buschelman   } /* End of artificial scope. */
44411ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
44421ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4443dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
44447cf1b8d3SKris Buschelman   SSE_SCOPE_END;
44457cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
44467cf1b8d3SKris Buschelman }
44477cf1b8d3SKris Buschelman 
44483660e330SKris Buschelman #endif
44498f690400SShri Abhyankar 
44504a2ae208SSatish Balay #undef __FUNCT__
4451*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
4452*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
44534e2b4712SSatish Balay {
44544e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
44554e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
44566849ba73SBarry Smith   PetscErrorCode    ierr;
44575d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
44585d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4459d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4460d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4461d9fead3dSBarry Smith   const PetscScalar *b;
44624e2b4712SSatish Balay 
44634e2b4712SSatish Balay   PetscFunctionBegin;
4464d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
44651ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4466f1af5d2fSBarry Smith   t  = a->solve_work;
44674e2b4712SSatish Balay 
44684e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
44694e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
44704e2b4712SSatish Balay 
44714e2b4712SSatish Balay   /* forward solve the lower triangular */
44724e2b4712SSatish Balay   idx    = 3*(*r++);
4473f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
44744e2b4712SSatish Balay   for (i=1; i<n; i++) {
44754e2b4712SSatish Balay     v     = aa + 9*ai[i];
44764e2b4712SSatish Balay     vi    = aj + ai[i];
44774e2b4712SSatish Balay     nz    = diag[i] - ai[i];
44784e2b4712SSatish Balay     idx   = 3*(*r++);
4479f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
44804e2b4712SSatish Balay     while (nz--) {
44814e2b4712SSatish Balay       idx   = 3*(*vi++);
4482f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4483f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4484f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4485f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
44864e2b4712SSatish Balay       v += 9;
44874e2b4712SSatish Balay     }
44884e2b4712SSatish Balay     idx = 3*i;
4489f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
44904e2b4712SSatish Balay   }
44914e2b4712SSatish Balay   /* backward solve the upper triangular */
44924e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
44934e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
44944e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
44954e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
44964e2b4712SSatish Balay     idt  = 3*i;
4497f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
44984e2b4712SSatish Balay     while (nz--) {
44994e2b4712SSatish Balay       idx   = 3*(*vi++);
4500f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4501f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4502f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4503f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
45044e2b4712SSatish Balay       v += 9;
45054e2b4712SSatish Balay     }
45064e2b4712SSatish Balay     idc = 3*(*c--);
45074e2b4712SSatish Balay     v   = aa + 9*diag[i];
4508f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4509f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4510f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
45114e2b4712SSatish Balay   }
45124e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45134e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4514d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45151ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4516dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
45174e2b4712SSatish Balay   PetscFunctionReturn(0);
45184e2b4712SSatish Balay }
45194e2b4712SSatish Balay 
45200c4413a7SShri Abhyankar #undef __FUNCT__
4521a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_newdatastruct"
4522a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_newdatastruct(Mat A,Vec bb,Vec xx)
45230c4413a7SShri Abhyankar {
45240c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
45250c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
45260c4413a7SShri Abhyankar   PetscErrorCode    ierr;
45270c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt,idc,m;
45280c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
45290c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
45300c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
45310c4413a7SShri Abhyankar   const PetscScalar *b;
45320c4413a7SShri Abhyankar 
45330c4413a7SShri Abhyankar   PetscFunctionBegin;
45340c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45350c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45360c4413a7SShri Abhyankar   t  = a->solve_work;
45370c4413a7SShri Abhyankar 
45380c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
45390c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
45400c4413a7SShri Abhyankar 
45410c4413a7SShri Abhyankar   /* forward solve the lower triangular */
45420c4413a7SShri Abhyankar   idx    = 3*r[0];
45430c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
45440c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
45450c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
45460c4413a7SShri Abhyankar     vi    = aj + ai[i];
45470c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
45480c4413a7SShri Abhyankar     idx   = 3*r[i];
45490c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
45500c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
45510c4413a7SShri Abhyankar       idx   = 3*vi[m];
45520c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
45530c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
45540c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
45550c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
45560c4413a7SShri Abhyankar       v += 9;
45570c4413a7SShri Abhyankar     }
45580c4413a7SShri Abhyankar     idx = 3*i;
45590c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
45600c4413a7SShri Abhyankar   }
45610c4413a7SShri Abhyankar   /* backward solve the upper triangular */
45620c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
45630c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
45640c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
45650c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
45660c4413a7SShri Abhyankar     idt  = 3*i;
45670c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
45680c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
45690c4413a7SShri Abhyankar       idx   = 3*vi[m];
45700c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
45710c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
45720c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
45730c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
45740c4413a7SShri Abhyankar       v += 9;
45750c4413a7SShri Abhyankar     }
45760c4413a7SShri Abhyankar     idc = 3*c[i];
45770c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
45780c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
45790c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
45800c4413a7SShri Abhyankar   }
45810c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
45820c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
45830c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
45840c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
45850c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
45860c4413a7SShri Abhyankar   PetscFunctionReturn(0);
45870c4413a7SShri Abhyankar }
45880c4413a7SShri Abhyankar 
458915091d37SBarry Smith /*
459015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
459115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
459215091d37SBarry Smith */
45934a2ae208SSatish Balay #undef __FUNCT__
4594*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
4595*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
459615091d37SBarry Smith {
459715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4598690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4599dfbe8321SBarry Smith   PetscErrorCode    ierr;
4600690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4601d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4602d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4603d9fead3dSBarry Smith   const PetscScalar *b;
4604690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
460515091d37SBarry Smith 
460615091d37SBarry Smith   PetscFunctionBegin;
4607d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46081ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
460915091d37SBarry Smith 
461015091d37SBarry Smith   /* forward solve the lower triangular */
461115091d37SBarry Smith   idx    = 0;
461215091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
461315091d37SBarry Smith   for (i=1; i<n; i++) {
461415091d37SBarry Smith     v     =  aa      + 9*ai[i];
461515091d37SBarry Smith     vi    =  aj      + ai[i];
461615091d37SBarry Smith     nz    =  diag[i] - ai[i];
461715091d37SBarry Smith     idx   +=  3;
4618f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
461915091d37SBarry Smith     while (nz--) {
462015091d37SBarry Smith       jdx   = 3*(*vi++);
462115091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4622f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4623f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4624f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
462515091d37SBarry Smith       v    += 9;
462615091d37SBarry Smith     }
4627f1af5d2fSBarry Smith     x[idx]   = s1;
4628f1af5d2fSBarry Smith     x[1+idx] = s2;
4629f1af5d2fSBarry Smith     x[2+idx] = s3;
463015091d37SBarry Smith   }
463115091d37SBarry Smith   /* backward solve the upper triangular */
463215091d37SBarry Smith   for (i=n-1; i>=0; i--){
463315091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
463415091d37SBarry Smith     vi   = aj + diag[i] + 1;
463515091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
463615091d37SBarry Smith     idt  = 3*i;
4637f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4638f1af5d2fSBarry Smith     s3 = x[2+idt];
463915091d37SBarry Smith     while (nz--) {
464015091d37SBarry Smith       idx   = 3*(*vi++);
464115091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4642f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4643f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4644f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
464515091d37SBarry Smith       v    += 9;
464615091d37SBarry Smith     }
464715091d37SBarry Smith     v        = aa +  9*diag[i];
4648f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4649f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4650f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
465115091d37SBarry Smith   }
465215091d37SBarry Smith 
4653d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
46541ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4655dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
465615091d37SBarry Smith   PetscFunctionReturn(0);
465715091d37SBarry Smith }
465815091d37SBarry Smith 
4659cee9d6f2SShri Abhyankar #undef __FUNCT__
4660a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct"
4661a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4662b2b2dd24SShri Abhyankar {
4663b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4664b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz;
4665b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4666b2b2dd24SShri Abhyankar     PetscInt          idx,jdx,idt;
4667b2b2dd24SShri Abhyankar     PetscInt          bs = A->rmap->bs,bs2 = a->bs2;
4668b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4669b2b2dd24SShri Abhyankar     PetscScalar       *x;
4670b2b2dd24SShri Abhyankar     const PetscScalar *b;
4671b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4672b2b2dd24SShri Abhyankar 
4673b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4674b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4675b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4676b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4677b2b2dd24SShri Abhyankar     idx    = 0;
4678b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4679b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4680b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4681b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4682b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4683b2b2dd24SShri Abhyankar       idx   = bs*i;
4684b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4685b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4686b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4687b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4688b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4689b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4690b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4691b2b2dd24SShri Abhyankar 
4692b2b2dd24SShri Abhyankar           v   +=  bs2;
4693b2b2dd24SShri Abhyankar         }
4694b2b2dd24SShri Abhyankar 
4695b2b2dd24SShri Abhyankar        x[idx]   = s1;
4696b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4697b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4698b2b2dd24SShri Abhyankar     }
4699b2b2dd24SShri Abhyankar 
4700b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4701b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4702b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4703b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4704b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4705b2b2dd24SShri Abhyankar      idt = bs*i;
4706b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4707b2b2dd24SShri Abhyankar 
4708b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4709b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4710b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4711b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4712b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4713b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4714b2b2dd24SShri Abhyankar 
4715b2b2dd24SShri Abhyankar         v   +=  bs2;
4716b2b2dd24SShri Abhyankar     }
4717b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4718b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4719b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4720b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4721b2b2dd24SShri Abhyankar 
4722b2b2dd24SShri Abhyankar   }
4723b2b2dd24SShri Abhyankar 
4724b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4725b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4726b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4727b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4728b2b2dd24SShri Abhyankar }
4729b2b2dd24SShri Abhyankar 
4730b2b2dd24SShri Abhyankar #undef __FUNCT__
4731*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
4732*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
47334e2b4712SSatish Balay {
47344e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47354e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
47366849ba73SBarry Smith   PetscErrorCode    ierr;
47375d0c19d7SBarry Smith   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc;
47385d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4739d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4740d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4741d9fead3dSBarry Smith   const PetscScalar *b;
47424e2b4712SSatish Balay 
47434e2b4712SSatish Balay   PetscFunctionBegin;
4744d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4746f1af5d2fSBarry Smith   t  = a->solve_work;
47474e2b4712SSatish Balay 
47484e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47494e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47504e2b4712SSatish Balay 
47514e2b4712SSatish Balay   /* forward solve the lower triangular */
47524e2b4712SSatish Balay   idx    = 2*(*r++);
4753f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
47544e2b4712SSatish Balay   for (i=1; i<n; i++) {
47554e2b4712SSatish Balay     v     = aa + 4*ai[i];
47564e2b4712SSatish Balay     vi    = aj + ai[i];
47574e2b4712SSatish Balay     nz    = diag[i] - ai[i];
47584e2b4712SSatish Balay     idx   = 2*(*r++);
4759f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
47604e2b4712SSatish Balay     while (nz--) {
47614e2b4712SSatish Balay       idx   = 2*(*vi++);
4762f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4763f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4764f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
47654e2b4712SSatish Balay       v += 4;
47664e2b4712SSatish Balay     }
47674e2b4712SSatish Balay     idx = 2*i;
4768f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
47694e2b4712SSatish Balay   }
47704e2b4712SSatish Balay   /* backward solve the upper triangular */
47714e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
47724e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
47734e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
47744e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
47754e2b4712SSatish Balay     idt  = 2*i;
4776f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
47774e2b4712SSatish Balay     while (nz--) {
47784e2b4712SSatish Balay       idx   = 2*(*vi++);
4779f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
4780f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4781f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
47824e2b4712SSatish Balay       v += 4;
47834e2b4712SSatish Balay     }
47844e2b4712SSatish Balay     idc = 2*(*c--);
47854e2b4712SSatish Balay     v   = aa + 4*diag[i];
4786f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
4787f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
47884e2b4712SSatish Balay   }
47894e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
47904e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4791d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47921ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4793dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
47944e2b4712SSatish Balay   PetscFunctionReturn(0);
47954e2b4712SSatish Balay }
47964e2b4712SSatish Balay 
47970c4413a7SShri Abhyankar #undef __FUNCT__
4798a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_newdatastruct"
4799a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_newdatastruct(Mat A,Vec bb,Vec xx)
48000c4413a7SShri Abhyankar {
48010c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
48020c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
48030c4413a7SShri Abhyankar   PetscErrorCode    ierr;
48040c4413a7SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,jdx,idt,idc,m;
48050c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
48060c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
48070c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
48080c4413a7SShri Abhyankar   const PetscScalar *b;
48090c4413a7SShri Abhyankar 
48100c4413a7SShri Abhyankar   PetscFunctionBegin;
48110c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48120c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
48130c4413a7SShri Abhyankar   t  = a->solve_work;
48140c4413a7SShri Abhyankar 
48150c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
48160c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
48170c4413a7SShri Abhyankar 
48180c4413a7SShri Abhyankar   /* forward solve the lower triangular */
48190c4413a7SShri Abhyankar   idx    = 2*r[0];
48200c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
48210c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
48220c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
48230c4413a7SShri Abhyankar     vi    = aj + ai[i];
48240c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
48250c4413a7SShri Abhyankar     idx   = 2*r[i];
48260c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
48270c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48280c4413a7SShri Abhyankar       jdx   = 2*vi[m];
48290c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
48300c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
48310c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
48320c4413a7SShri Abhyankar       v += 4;
48330c4413a7SShri Abhyankar     }
48340c4413a7SShri Abhyankar     idx = 2*i;
48350c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
48360c4413a7SShri Abhyankar   }
48370c4413a7SShri Abhyankar   /* backward solve the upper triangular */
48380c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
48390c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
48400c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
48410c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
48420c4413a7SShri Abhyankar     idt  = 2*i;
48430c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
48440c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48450c4413a7SShri Abhyankar       idx   = 2*vi[m];
48460c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
48470c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
48480c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
48490c4413a7SShri Abhyankar       v += 4;
48500c4413a7SShri Abhyankar     }
48510c4413a7SShri Abhyankar     idc = 2*c[i];
48520c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
48530c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
48540c4413a7SShri Abhyankar   }
48550c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48560c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48570c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48580c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
48590c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
48600c4413a7SShri Abhyankar   PetscFunctionReturn(0);
48610c4413a7SShri Abhyankar }
48628f690400SShri Abhyankar 
486315091d37SBarry Smith /*
486415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
486515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
486615091d37SBarry Smith */
48674a2ae208SSatish Balay #undef __FUNCT__
4868*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
4869*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
487015091d37SBarry Smith {
487115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4872690b6cddSBarry Smith   PetscInt          n=a->mbs,*ai=a->i,*aj=a->j;
4873dfbe8321SBarry Smith   PetscErrorCode    ierr;
4874690b6cddSBarry Smith   PetscInt          *diag = a->diag;
4875d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4876d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
4877d9fead3dSBarry Smith   const PetscScalar *b;
4878690b6cddSBarry Smith   PetscInt          jdx,idt,idx,nz,*vi,i;
487915091d37SBarry Smith 
488015091d37SBarry Smith   PetscFunctionBegin;
4881d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
488315091d37SBarry Smith 
488415091d37SBarry Smith   /* forward solve the lower triangular */
488515091d37SBarry Smith   idx    = 0;
488615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
488715091d37SBarry Smith   for (i=1; i<n; i++) {
488815091d37SBarry Smith     v     =  aa      + 4*ai[i];
488915091d37SBarry Smith     vi    =  aj      + ai[i];
489015091d37SBarry Smith     nz    =  diag[i] - ai[i];
489115091d37SBarry Smith     idx   +=  2;
4892f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
489315091d37SBarry Smith     while (nz--) {
489415091d37SBarry Smith       jdx   = 2*(*vi++);
489515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
4896f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4897f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
489815091d37SBarry Smith       v    += 4;
489915091d37SBarry Smith     }
4900f1af5d2fSBarry Smith     x[idx]   = s1;
4901f1af5d2fSBarry Smith     x[1+idx] = s2;
490215091d37SBarry Smith   }
490315091d37SBarry Smith   /* backward solve the upper triangular */
490415091d37SBarry Smith   for (i=n-1; i>=0; i--){
490515091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
490615091d37SBarry Smith     vi   = aj + diag[i] + 1;
490715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
490815091d37SBarry Smith     idt  = 2*i;
4909f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
491015091d37SBarry Smith     while (nz--) {
491115091d37SBarry Smith       idx   = 2*(*vi++);
491215091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
4913f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
4914f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
491515091d37SBarry Smith       v    += 4;
491615091d37SBarry Smith     }
491715091d37SBarry Smith     v        = aa +  4*diag[i];
4918f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
4919f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
492015091d37SBarry Smith   }
492115091d37SBarry Smith 
4922d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
49231ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4924dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
492515091d37SBarry Smith   PetscFunctionReturn(0);
492615091d37SBarry Smith }
492715091d37SBarry Smith 
4928cee9d6f2SShri Abhyankar #undef __FUNCT__
4929a2d6a19aSShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct"
4930a2d6a19aSShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_newdatastruct(Mat A,Vec bb,Vec xx)
4931b2b2dd24SShri Abhyankar {
4932b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4933b2b2dd24SShri Abhyankar     PetscInt          i,k,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag,nz,idx,idt;
4934b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4935b2b2dd24SShri Abhyankar     PetscInt          jdx;
4936b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4937b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
4938b2b2dd24SShri Abhyankar     const PetscScalar *b;
4939b2b2dd24SShri Abhyankar 
4940b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4941b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4942b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4943b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4944b2b2dd24SShri Abhyankar     idx    = 0;
4945b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
4946b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4947b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
4948b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4949b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4950b2b2dd24SShri Abhyankar        idx  = 2*i;
4951b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
4952b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4953b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
4954b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
4955b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
4956b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
4957b2b2dd24SShri Abhyankar            v   +=  4;
4958b2b2dd24SShri Abhyankar         }
4959b2b2dd24SShri Abhyankar        x[idx]   = s1;
4960b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4961b2b2dd24SShri Abhyankar     }
4962b2b2dd24SShri Abhyankar 
4963b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4964b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4965b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
4966b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4967b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4968b2b2dd24SShri Abhyankar      idt = 2*i;
4969b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
4970b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4971b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
4972b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
4973b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
4974b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
4975b2b2dd24SShri Abhyankar          v    += 4;
4976b2b2dd24SShri Abhyankar     }
4977b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4978b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
4979b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
4980b2b2dd24SShri Abhyankar   }
4981b2b2dd24SShri Abhyankar 
4982b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4983b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4984b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
4985b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4986b2b2dd24SShri Abhyankar }
4987b2b2dd24SShri Abhyankar 
4988b2b2dd24SShri Abhyankar #undef __FUNCT__
4989*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
4990*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
49914e2b4712SSatish Balay {
49924e2b4712SSatish Balay   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
49934e2b4712SSatish Balay   IS             iscol=a->col,isrow=a->row;
49946849ba73SBarry Smith   PetscErrorCode ierr;
49955d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz;
49965d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
49973f1db9ecSBarry Smith   MatScalar      *aa=a->a,*v;
499887828ca2SBarry Smith   PetscScalar    *x,*b,s1,*t;
49994e2b4712SSatish Balay 
50004e2b4712SSatish Balay   PetscFunctionBegin;
50014e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
50024e2b4712SSatish Balay 
50031ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
50041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5005f1af5d2fSBarry Smith   t  = a->solve_work;
50064e2b4712SSatish Balay 
50074e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50084e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
50094e2b4712SSatish Balay 
50104e2b4712SSatish Balay   /* forward solve the lower triangular */
5011f1af5d2fSBarry Smith   t[0] = b[*r++];
50124e2b4712SSatish Balay   for (i=1; i<n; i++) {
50134e2b4712SSatish Balay     v     = aa + ai[i];
50144e2b4712SSatish Balay     vi    = aj + ai[i];
50154e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5016f1af5d2fSBarry Smith     s1  = b[*r++];
50174e2b4712SSatish Balay     while (nz--) {
5018f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
50194e2b4712SSatish Balay     }
5020f1af5d2fSBarry Smith     t[i] = s1;
50214e2b4712SSatish Balay   }
50224e2b4712SSatish Balay   /* backward solve the upper triangular */
50234e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
50244e2b4712SSatish Balay     v    = aa + diag[i] + 1;
50254e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
50264e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5027f1af5d2fSBarry Smith     s1 = t[i];
50284e2b4712SSatish Balay     while (nz--) {
5029f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
50304e2b4712SSatish Balay     }
5031f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
50324e2b4712SSatish Balay   }
50334e2b4712SSatish Balay 
50344e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
50354e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
50361ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50371ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5038dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
50394e2b4712SSatish Balay   PetscFunctionReturn(0);
50404e2b4712SSatish Balay }
504115091d37SBarry Smith /*
504215091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
504315091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
504415091d37SBarry Smith */
50454a2ae208SSatish Balay #undef __FUNCT__
5046*06e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
5047*06e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
504815091d37SBarry Smith {
504915091d37SBarry Smith   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5050690b6cddSBarry Smith   PetscInt       n=a->mbs,*ai=a->i,*aj=a->j;
5051dfbe8321SBarry Smith   PetscErrorCode ierr;
5052690b6cddSBarry Smith   PetscInt       *diag = a->diag;
505315091d37SBarry Smith   MatScalar      *aa=a->a;
505487828ca2SBarry Smith   PetscScalar    *x,*b;
505587828ca2SBarry Smith   PetscScalar    s1,x1;
505615091d37SBarry Smith   MatScalar      *v;
5057690b6cddSBarry Smith   PetscInt       jdx,idt,idx,nz,*vi,i;
505815091d37SBarry Smith 
505915091d37SBarry Smith   PetscFunctionBegin;
50601ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
50611ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
506215091d37SBarry Smith 
506315091d37SBarry Smith   /* forward solve the lower triangular */
506415091d37SBarry Smith   idx    = 0;
506515091d37SBarry Smith   x[0]   = b[0];
506615091d37SBarry Smith   for (i=1; i<n; i++) {
506715091d37SBarry Smith     v     =  aa      + ai[i];
506815091d37SBarry Smith     vi    =  aj      + ai[i];
506915091d37SBarry Smith     nz    =  diag[i] - ai[i];
507015091d37SBarry Smith     idx   +=  1;
5071f1af5d2fSBarry Smith     s1  =  b[idx];
507215091d37SBarry Smith     while (nz--) {
507315091d37SBarry Smith       jdx   = *vi++;
507415091d37SBarry Smith       x1    = x[jdx];
5075f1af5d2fSBarry Smith       s1 -= v[0]*x1;
507615091d37SBarry Smith       v    += 1;
507715091d37SBarry Smith     }
5078f1af5d2fSBarry Smith     x[idx]   = s1;
507915091d37SBarry Smith   }
508015091d37SBarry Smith   /* backward solve the upper triangular */
508115091d37SBarry Smith   for (i=n-1; i>=0; i--){
508215091d37SBarry Smith     v    = aa + diag[i] + 1;
508315091d37SBarry Smith     vi   = aj + diag[i] + 1;
508415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
508515091d37SBarry Smith     idt  = i;
5086f1af5d2fSBarry Smith     s1 = x[idt];
508715091d37SBarry Smith     while (nz--) {
508815091d37SBarry Smith       idx   = *vi++;
508915091d37SBarry Smith       x1    = x[idx];
5090f1af5d2fSBarry Smith       s1 -= v[0]*x1;
509115091d37SBarry Smith       v    += 1;
509215091d37SBarry Smith     }
509315091d37SBarry Smith     v        = aa +  diag[i];
5094f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
509515091d37SBarry Smith   }
50961ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
50971ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5098dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
509915091d37SBarry Smith   PetscFunctionReturn(0);
510015091d37SBarry Smith }
51014e2b4712SSatish Balay 
51024e2b4712SSatish Balay /* ----------------------------------------------------------------*/
510316a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
51046bce7ff8SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization(Mat,PetscTruth);
5105ae3d28f0SHong Zhang EXTERN PetscErrorCode MatSeqBAIJSetNumericFactorization_newdatastruct(Mat,PetscTruth);
51066bce7ff8SHong Zhang 
51076bce7ff8SHong Zhang #undef __FUNCT__
51086bce7ff8SHong Zhang #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N_newdatastruct"
51096bce7ff8SHong Zhang PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N_newdatastruct(Mat B,Mat A,const MatFactorInfo *info)
51106bce7ff8SHong Zhang {
51116bce7ff8SHong Zhang   Mat            C=B;
51126bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
51136bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
51146bce7ff8SHong Zhang   PetscErrorCode ierr;
51156bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
51166bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
51176bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5118b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5119914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5120914a18a2SHong Zhang   MatScalar      *v_work;
5121ae3d28f0SHong Zhang   PetscTruth     col_identity,row_identity,both_identity;
51226bce7ff8SHong Zhang 
51236bce7ff8SHong Zhang   PetscFunctionBegin;
51246bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
51256bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5126ae3d28f0SHong Zhang 
5127fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5128fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
51296bce7ff8SHong Zhang   ics  = ic;
51306bce7ff8SHong Zhang 
5131914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5132fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5133914a18a2SHong Zhang 
51346bce7ff8SHong Zhang   for (i=0; i<n; i++){
51356bce7ff8SHong Zhang     /* zero rtmp */
51366bce7ff8SHong Zhang     /* L part */
51376bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
51386bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5139914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5140914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5141914a18a2SHong Zhang     }
51426bce7ff8SHong Zhang 
51436bce7ff8SHong Zhang     /* U part */
51441a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
51451a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
51461a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
51471a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51481a83e813SShri Abhyankar     }
51491a83e813SShri Abhyankar 
51501a83e813SShri Abhyankar     /* load in initial (unfactored row) */
51511a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
51521a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
51531a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
51541a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51551a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
51561a83e813SShri Abhyankar     }
51571a83e813SShri Abhyankar 
51581a83e813SShri Abhyankar     /* elimination */
51591a83e813SShri Abhyankar     bjtmp = bj + bi[i];
51601a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
51611a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
51621a83e813SShri Abhyankar       row = bjtmp[k];
51631a83e813SShri Abhyankar       pc = rtmp + bs2*row;
51641a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
51651a83e813SShri Abhyankar       if (flg) {
51661a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
51671a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
51681a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
51691a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
51701a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
51711a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
51721a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
51731a83e813SShri Abhyankar         }
51741a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
51751a83e813SShri Abhyankar       }
51761a83e813SShri Abhyankar     }
51771a83e813SShri Abhyankar 
51781a83e813SShri Abhyankar     /* finished row so stick it into b->a */
51791a83e813SShri Abhyankar     /* L part */
51801a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
51811a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
51821a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
51831a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
51841a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51851a83e813SShri Abhyankar     }
51861a83e813SShri Abhyankar 
51871a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
51881a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
51891a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
51901a83e813SShri Abhyankar     /* if (*pj != i)SETERRQ2(PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
51911a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
51921a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
51931a83e813SShri Abhyankar 
51941a83e813SShri Abhyankar     /* U part */
51951a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
51961a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
51971a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
51981a83e813SShri Abhyankar     for (j=0; j<nz; j++){
51991a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
52001a83e813SShri Abhyankar     }
52011a83e813SShri Abhyankar   }
52021a83e813SShri Abhyankar 
52031a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5204fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
52051a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
52061a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
52071a83e813SShri Abhyankar 
5208ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5209ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5210ae3d28f0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
5211ae3d28f0SHong Zhang   if (both_identity){
5212a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering_newdatastruct;
5213ae3d28f0SHong Zhang   } else {
5214a2d6a19aSShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_newdatastruct;
5215ae3d28f0SHong Zhang   }
52168499736aSShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N_newdatastruct;
5217ae3d28f0SHong Zhang 
52181a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
52191a83e813SShri Abhyankar   ierr = PetscLogFlops(1.3333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
52201a83e813SShri Abhyankar   PetscFunctionReturn(0);
52211a83e813SShri Abhyankar }
52221a83e813SShri Abhyankar 
52236bce7ff8SHong Zhang /*
52246bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
522516a2bf60SHong Zhang    See MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct() for detailed description
522616a2bf60SHong Zhang    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_newdatastruct().
52276bce7ff8SHong Zhang */
5228c0c7eb62SShri Abhyankar 
52296bce7ff8SHong Zhang #undef __FUNCT__
52306bce7ff8SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct"
52316bce7ff8SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
52326bce7ff8SHong Zhang {
52336bce7ff8SHong Zhang 
52346bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
52356bce7ff8SHong Zhang   PetscErrorCode     ierr;
523616a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
523735aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
523835aa4fcfSShri Abhyankar 
523935aa4fcfSShri Abhyankar   PetscFunctionBegin;
524035aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
524135aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
524235aa4fcfSShri Abhyankar 
524335aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
524435aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
524535aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
524635aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
524735aa4fcfSShri Abhyankar   if (!b->diag){
524835aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
524935aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
525035aa4fcfSShri Abhyankar   }
525135aa4fcfSShri Abhyankar   bdiag = b->diag;
525235aa4fcfSShri Abhyankar 
525335aa4fcfSShri Abhyankar   if (n > 0) {
525435aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
525535aa4fcfSShri Abhyankar   }
525635aa4fcfSShri Abhyankar 
525735aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
525835aa4fcfSShri Abhyankar   bi = b->i;
525935aa4fcfSShri Abhyankar   bj = b->j;
526035aa4fcfSShri Abhyankar 
526135aa4fcfSShri Abhyankar   /* L part */
526235aa4fcfSShri Abhyankar   bi[0] = 0;
526335aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
526435aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
526535aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
526635aa4fcfSShri Abhyankar     aj = a->j + ai[i];
526735aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
526835aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
526935aa4fcfSShri Abhyankar     }
527035aa4fcfSShri Abhyankar   }
527135aa4fcfSShri Abhyankar 
527235aa4fcfSShri Abhyankar   /* U part */
527335aa4fcfSShri Abhyankar   bi_temp = bi[n];
527435aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
527535aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
527635aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
527735aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
527835aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
527935aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
528035aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
528135aa4fcfSShri Abhyankar     }
528235aa4fcfSShri Abhyankar     /* diag[i] */
528335aa4fcfSShri Abhyankar     *bj = i; bj++;
528435aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
528535aa4fcfSShri Abhyankar   }
528635aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
528735aa4fcfSShri Abhyankar }
528835aa4fcfSShri Abhyankar 
528935aa4fcfSShri Abhyankar #undef __FUNCT__
529016a2bf60SHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_newdatastruct"
529116a2bf60SHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_newdatastruct(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
529216a2bf60SHong Zhang {
529316a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
529416a2bf60SHong Zhang   IS                 isicol;
529516a2bf60SHong Zhang   PetscErrorCode     ierr;
529616a2bf60SHong Zhang   const PetscInt     *r,*ic;
52977fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
529816a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
529916a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
530016a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
53017fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
530216a2bf60SHong Zhang   PetscReal          f;
530316a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
530416a2bf60SHong Zhang   PetscBT            lnkbt;
530516a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
530616a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
530716a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
530816a2bf60SHong Zhang   PetscTruth         missing;
53097fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
5310*06e38f1dSHong Zhang   PetscTruth         newdatastruct = PETSC_FALSE;
531116a2bf60SHong Zhang 
531216a2bf60SHong Zhang   PetscFunctionBegin;
5313*06e38f1dSHong Zhang   ierr = PetscOptionsGetTruth(PETSC_NULL,"-ilu_old",&newdatastruct,PETSC_NULL);CHKERRQ(ierr);
5314*06e38f1dSHong Zhang   if (newdatastruct){
5315*06e38f1dSHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_inplace(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5316*06e38f1dSHong Zhang     PetscFunctionReturn(0);
5317*06e38f1dSHong Zhang   }
531816a2bf60SHong Zhang   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
531916a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
532016a2bf60SHong Zhang   if (missing) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
532116a2bf60SHong Zhang 
532216a2bf60SHong Zhang   f             = info->fill;
532316a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
532416a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
532516a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
532616a2bf60SHong Zhang 
532716a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
532816a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
53297fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
533016a2bf60SHong Zhang 
53317fa3a6a0SHong Zhang   if (!levels && both_identity) {
533216a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
533316a2bf60SHong Zhang     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0_newdatastruct(fact,A,isrow,iscol,info);CHKERRQ(ierr);
5334ae3d28f0SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
533535aa4fcfSShri Abhyankar 
533635aa4fcfSShri Abhyankar     fact->factor = MAT_FACTOR_ILU;
533735aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
533835aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
533935aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
534035aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
534135aa4fcfSShri Abhyankar     b->row           = isrow;
534235aa4fcfSShri Abhyankar     b->col           = iscol;
534335aa4fcfSShri Abhyankar     b->icol          = isicol;
534435aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
534535aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
534635aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
534735aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
534835aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
534935aa4fcfSShri Abhyankar   }
535035aa4fcfSShri Abhyankar 
535135aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
535235aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
535335aa4fcfSShri Abhyankar 
535435aa4fcfSShri Abhyankar   /* get new row pointers */
535535aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
535635aa4fcfSShri Abhyankar   bi[0] = 0;
535735aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
535835aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
535935aa4fcfSShri Abhyankar   bdiag[0]  = 0;
536035aa4fcfSShri Abhyankar 
5361fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
536235aa4fcfSShri Abhyankar 
536335aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
536435aa4fcfSShri Abhyankar   nlnk = n + 1;
536535aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
536635aa4fcfSShri Abhyankar 
536735aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
536835aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
536935aa4fcfSShri Abhyankar   current_space = free_space;
537035aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
537135aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
537235aa4fcfSShri Abhyankar 
537335aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
537435aa4fcfSShri Abhyankar     nzi = 0;
537535aa4fcfSShri Abhyankar     /* copy current row into linked list */
537635aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
537735aa4fcfSShri Abhyankar     if (!nnz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
537835aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
537935aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
538035aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
538135aa4fcfSShri Abhyankar     nzi += nlnk;
538235aa4fcfSShri Abhyankar 
538335aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
538435aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
538535aa4fcfSShri Abhyankar       fm = n;
538635aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
538735aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
538835aa4fcfSShri Abhyankar       lnk[fm]    = i;
538935aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
539035aa4fcfSShri Abhyankar       nzi++; dcount++;
539135aa4fcfSShri Abhyankar     }
539235aa4fcfSShri Abhyankar 
539335aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
539435aa4fcfSShri Abhyankar     nzbd = 0;
539535aa4fcfSShri Abhyankar     prow = lnk[n];
539635aa4fcfSShri Abhyankar     while (prow < i) {
539735aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
539835aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
539935aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
540035aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
540135aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
540235aa4fcfSShri Abhyankar       nzi += nlnk;
540335aa4fcfSShri Abhyankar       prow = lnk[prow];
540435aa4fcfSShri Abhyankar       nzbd++;
540535aa4fcfSShri Abhyankar     }
540635aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
540735aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
540835aa4fcfSShri Abhyankar 
540935aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
541035aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
541135aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
541235aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
541335aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
541435aa4fcfSShri Abhyankar       reallocs++;
541535aa4fcfSShri Abhyankar     }
541635aa4fcfSShri Abhyankar 
541735aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
541835aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
541935aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
542035aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
542135aa4fcfSShri Abhyankar 
542235aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
542335aa4fcfSShri Abhyankar     if (*(bj_ptr[i]+bdiag[i]) != i) {
542435aa4fcfSShri Abhyankar       SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
542535aa4fcfSShri Abhyankar     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
542635aa4fcfSShri Abhyankar     }
542735aa4fcfSShri Abhyankar 
542835aa4fcfSShri Abhyankar     current_space->array           += nzi;
542935aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
543035aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
543135aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
543235aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
543335aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
543435aa4fcfSShri Abhyankar   }
543535aa4fcfSShri Abhyankar 
543635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
543735aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
543835aa4fcfSShri Abhyankar 
543935aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
544035aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
544135aa4fcfSShri Abhyankar 
544235aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
544335aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceContiguous_LU_v2(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
544435aa4fcfSShri Abhyankar 
544535aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
544635aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5447fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
544835aa4fcfSShri Abhyankar 
544935aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
545035aa4fcfSShri Abhyankar   {
545135aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
545235aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
545335aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
545435aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
545535aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
545635aa4fcfSShri Abhyankar     if (diagonal_fill) {
545735aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
545835aa4fcfSShri Abhyankar     }
545935aa4fcfSShri Abhyankar   }
546035aa4fcfSShri Abhyankar #endif
546135aa4fcfSShri Abhyankar 
546235aa4fcfSShri Abhyankar   /* put together the new matrix */
546335aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
546435aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
546535aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
546635aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
546735aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
546835aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
546935aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
547035aa4fcfSShri Abhyankar   b->j          = bj;
547135aa4fcfSShri Abhyankar   b->i          = bi;
547235aa4fcfSShri Abhyankar   b->diag       = bdiag;
547335aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
547435aa4fcfSShri Abhyankar   b->ilen       = 0;
547535aa4fcfSShri Abhyankar   b->imax       = 0;
547635aa4fcfSShri Abhyankar   b->row        = isrow;
547735aa4fcfSShri Abhyankar   b->col        = iscol;
547835aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
547935aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
548035aa4fcfSShri Abhyankar   b->icol       = isicol;
548135aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
548235aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
548335aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
548435aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
548535aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
5486ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
5487ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
5488ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
5489ae3d28f0SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_newdatastruct(fact,both_identity);CHKERRQ(ierr);
549035aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
549135aa4fcfSShri Abhyankar }
549235aa4fcfSShri Abhyankar 
549335aa4fcfSShri Abhyankar 
54944e2b4712SSatish Balay /*
54954e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
54964e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
54974e2b4712SSatish Balay    Not a good example of code reuse.
54984e2b4712SSatish Balay */
54994a2ae208SSatish Balay #undef __FUNCT__
5500*06e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
5501*06e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
55024e2b4712SSatish Balay {
55034e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
55044e2b4712SSatish Balay   IS             isicol;
55056849ba73SBarry Smith   PetscErrorCode ierr;
55065d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
55075d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5508a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5509d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
551041df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5511329f5518SBarry Smith   PetscReal      f;
55124e2b4712SSatish Balay 
55134e2b4712SSatish Balay   PetscFunctionBegin;
55146bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
55156bce7ff8SHong Zhang   if (flg) SETERRQ1(PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
55166bce7ff8SHong Zhang 
5517435faa5fSBarry Smith   f             = info->fill;
5518690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5519690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
55204c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
552116a2bf60SHong Zhang 
5522667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5523667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
55247d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5525309c388cSBarry Smith 
552641df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
552716a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
55286bce7ff8SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
55296bce7ff8SHong Zhang 
5530719d5645SBarry Smith     fact->factor = MAT_FACTOR_ILU;
5531ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
5532bb3d539aSBarry Smith     b->row       = isrow;
5533bb3d539aSBarry Smith     b->col       = iscol;
5534bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5535bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5536bb3d539aSBarry Smith     b->icol      = isicol;
5537bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5538b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
55396bce7ff8SHong Zhang     PetscFunctionReturn(0);
55406bce7ff8SHong Zhang   }
55416bce7ff8SHong Zhang 
55426bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
55434e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
55444e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
55454e2b4712SSatish Balay 
55464e2b4712SSatish Balay     /* get new row pointers */
5547690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
55484e2b4712SSatish Balay     ainew[0] = 0;
55494e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
5550690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
5551690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
55524e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
5553690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
55544e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
5555690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
55564e2b4712SSatish Balay     /* im is level for each filled value */
5557690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
55584e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
5559690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
55604e2b4712SSatish Balay     dloc[0]  = 0;
55614e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
5562435faa5fSBarry Smith 
5563435faa5fSBarry Smith       /* copy prow into linked list */
55644e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
55653b4a8b6dSBarry Smith       if (!nz) SETERRQ2(PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
55664e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
55674e2b4712SSatish Balay       fill[n]    = n;
5568435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
55694e2b4712SSatish Balay       while (nz--) {
55704e2b4712SSatish Balay 	fm  = n;
55714e2b4712SSatish Balay 	idx = ic[*xi++];
55724e2b4712SSatish Balay 	do {
55734e2b4712SSatish Balay 	  m  = fm;
55744e2b4712SSatish Balay 	  fm = fill[m];
55754e2b4712SSatish Balay 	} while (fm < idx);
55764e2b4712SSatish Balay 	fill[m]   = idx;
55774e2b4712SSatish Balay 	fill[idx] = fm;
55784e2b4712SSatish Balay 	im[idx]   = 0;
55794e2b4712SSatish Balay       }
5580435faa5fSBarry Smith 
5581435faa5fSBarry Smith       /* make sure diagonal entry is included */
5582435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
5583435faa5fSBarry Smith 	fm = n;
5584435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
5585435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
5586435faa5fSBarry Smith 	fill[fm]   = prow;
5587435faa5fSBarry Smith 	im[prow]   = 0;
5588435faa5fSBarry Smith 	nzf++;
5589335d9088SBarry Smith 	dcount++;
5590435faa5fSBarry Smith       }
5591435faa5fSBarry Smith 
55924e2b4712SSatish Balay       nzi = 0;
55934e2b4712SSatish Balay       row = fill[n];
55944e2b4712SSatish Balay       while (row < prow) {
55954e2b4712SSatish Balay 	incrlev = im[row] + 1;
55964e2b4712SSatish Balay 	nz      = dloc[row];
5597435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
55984e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
55994e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
56004e2b4712SSatish Balay 	fm      = row;
56014e2b4712SSatish Balay 	while (nnz-- > 0) {
56024e2b4712SSatish Balay 	  idx = *xi++;
56034e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
56044e2b4712SSatish Balay 	    flev++;
56054e2b4712SSatish Balay 	    continue;
56064e2b4712SSatish Balay 	  }
56074e2b4712SSatish Balay 	  do {
56084e2b4712SSatish Balay 	    m  = fm;
56094e2b4712SSatish Balay 	    fm = fill[m];
56104e2b4712SSatish Balay 	  } while (fm < idx);
56114e2b4712SSatish Balay 	  if (fm != idx) {
56124e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
56134e2b4712SSatish Balay 	    fill[m]   = idx;
56144e2b4712SSatish Balay 	    fill[idx] = fm;
56154e2b4712SSatish Balay 	    fm        = idx;
56164e2b4712SSatish Balay 	    nzf++;
5617ecf371e4SBarry Smith 	  } else {
56184e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
56194e2b4712SSatish Balay 	  }
56204e2b4712SSatish Balay 	  flev++;
56214e2b4712SSatish Balay 	}
56224e2b4712SSatish Balay 	row = fill[row];
56234e2b4712SSatish Balay 	nzi++;
56244e2b4712SSatish Balay       }
56254e2b4712SSatish Balay       /* copy new filled row into permanent storage */
56264e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
56274e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
5628ecf371e4SBarry Smith 
5629ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
5630ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
5631ecf371e4SBarry Smith 	/* just double the memory each time */
5632690b6cddSBarry Smith 	PetscInt maxadd = jmax;
5633ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
56344e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
56354e2b4712SSatish Balay 	jmax += maxadd;
5636ecf371e4SBarry Smith 
5637ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
56385d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
56395d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5640606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
56415d0c19d7SBarry Smith 	ajnew = xitmp;
56425d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
56435d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
5644606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
56455d0c19d7SBarry Smith 	ajfill = xitmp;
5646eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
56474e2b4712SSatish Balay       }
56485d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
56494e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
56504e2b4712SSatish Balay       dloc[prow]  = nzi;
56514e2b4712SSatish Balay       fm          = fill[n];
56524e2b4712SSatish Balay       while (nzf--) {
56535d0c19d7SBarry Smith 	*xitmp++ = fm;
56544e2b4712SSatish Balay 	*flev++ = im[fm];
56554e2b4712SSatish Balay 	fm      = fill[fm];
56564e2b4712SSatish Balay       }
5657435faa5fSBarry Smith       /* make sure row has diagonal entry */
5658435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
565977431f27SBarry Smith 	SETERRQ1(PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
56602401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
5661435faa5fSBarry Smith       }
56624e2b4712SSatish Balay     }
5663606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
56644e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
56654e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
5666606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
5667606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
56684e2b4712SSatish Balay 
56696cf91177SBarry Smith #if defined(PETSC_USE_INFO)
56704e2b4712SSatish Balay     {
5671329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
5672ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
5673ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
5674ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
5675ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
5676335d9088SBarry Smith       if (diagonal_fill) {
5677ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
5678335d9088SBarry Smith       }
56794e2b4712SSatish Balay     }
568063ba0a88SBarry Smith #endif
56814e2b4712SSatish Balay 
56824e2b4712SSatish Balay     /* put together the new matrix */
5683719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
5684719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
5685ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
5686e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
5687e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
56887c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
5689a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
56904e2b4712SSatish Balay     b->j          = ajnew;
56914e2b4712SSatish Balay     b->i          = ainew;
56924e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
56934e2b4712SSatish Balay     b->diag       = dloc;
56947f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
56954e2b4712SSatish Balay     b->ilen       = 0;
56964e2b4712SSatish Balay     b->imax       = 0;
56974e2b4712SSatish Balay     b->row        = isrow;
56984e2b4712SSatish Balay     b->col        = iscol;
5699bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5700c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5701c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5702e51c0b9cSSatish Balay     b->icol       = isicol;
570387828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
57044e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
57054e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
5706719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
57074e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
57084e2b4712SSatish Balay 
5709ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
5710ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
5711ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
57126bce7ff8SHong Zhang 
571341df41f0SMatthew Knepley   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
57148661488fSKris Buschelman   PetscFunctionReturn(0);
57158661488fSKris Buschelman }
57168661488fSKris Buschelman 
5717732ee342SKris Buschelman #undef __FUNCT__
57187e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
5719dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
57207e7071cdSKris Buschelman {
572112272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
572212272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
57235a9542e3SKris Buschelman   PetscFunctionBegin;
57247cf1b8d3SKris Buschelman   /* Undo Column scaling */
57257cf1b8d3SKris Buschelman /*    while (nz--) { */
57267cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
57277cf1b8d3SKris Buschelman /*    } */
5728c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
5729c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
57307cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
57317cf1b8d3SKris Buschelman }
57327cf1b8d3SKris Buschelman 
57337cf1b8d3SKris Buschelman #undef __FUNCT__
57347cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
5735dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
57367cf1b8d3SKris Buschelman {
57377cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
5738b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
57392aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
57405a9542e3SKris Buschelman   PetscFunctionBegin;
57410b9da03eSKris Buschelman   /* Is this really necessary? */
574220235379SKris Buschelman   while (nz--) {
57430b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
57447e7071cdSKris Buschelman   }
5745c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
57467e7071cdSKris Buschelman   PetscFunctionReturn(0);
57477e7071cdSKris Buschelman }
57487e7071cdSKris Buschelman 
5749732ee342SKris Buschelman 
5750