xref: /petsc/src/mat/impls/baij/seq/baijfact2.c (revision 6ba06ab77c48d2bbbcba7e829ffcd04868efd3d5)
1be1d678aSKris Buschelman #define PETSCMAT_DLL
2be1d678aSKris Buschelman 
34e2b4712SSatish Balay /*
44e2b4712SSatish Balay     Factorization code for BAIJ format.
54e2b4712SSatish Balay */
64e2b4712SSatish Balay 
77c4f633dSBarry Smith #include "../src/mat/impls/baij/seq/baij.h"
8c60f0209SBarry Smith #include "../src/mat/blockinvert.h"
916a2bf60SHong Zhang #include "petscbt.h"
1016a2bf60SHong Zhang #include "../src/mat/utils/freespace.h"
114e2b4712SSatish Balay 
124a2ae208SSatish Balay #undef __FUNCT__
1306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace"
1406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
15f1af5d2fSBarry Smith {
16f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
17dfbe8321SBarry Smith   PetscErrorCode    ierr;
180b68f018SBarry Smith   PetscInt          i,nz;
190b68f018SBarry Smith   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
200b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
210b68f018SBarry Smith   PetscScalar       s1,*x;
220b68f018SBarry Smith   const PetscScalar *b;
23f1af5d2fSBarry Smith 
24f1af5d2fSBarry Smith   PetscFunctionBegin;
25ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
260b68f018SBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
28f1af5d2fSBarry Smith 
29f1af5d2fSBarry Smith   /* forward solve the U^T */
30f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
31f1af5d2fSBarry Smith 
32f1af5d2fSBarry Smith     v     = aa + diag[i];
33f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
34ef66eb69SBarry Smith     s1    = (*v++)*x[i];
35f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
36f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
37f1af5d2fSBarry Smith     while (nz--) {
38f1af5d2fSBarry Smith       x[*vi++]  -= (*v++)*s1;
39f1af5d2fSBarry Smith     }
40f1af5d2fSBarry Smith     x[i]   = s1;
41f1af5d2fSBarry Smith   }
42f1af5d2fSBarry Smith   /* backward solve the L^T */
43f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
44f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
45f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
46f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
47f1af5d2fSBarry Smith     s1   = x[i];
48f1af5d2fSBarry Smith     while (nz--) {
49f1af5d2fSBarry Smith       x[*vi--]   -=  (*v--)*s1;
50f1af5d2fSBarry Smith     }
51f1af5d2fSBarry Smith   }
520b68f018SBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
531ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
54dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
55f1af5d2fSBarry Smith   PetscFunctionReturn(0);
56f1af5d2fSBarry Smith }
57f1af5d2fSBarry Smith 
584a2ae208SSatish Balay #undef __FUNCT__
5906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace"
6006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
61f1af5d2fSBarry Smith {
62f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
63dfbe8321SBarry Smith   PetscErrorCode    ierr;
64b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
65b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
66b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
67b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
68b3260449SShri Abhyankar   const PetscScalar *b;
69f1af5d2fSBarry Smith 
70f1af5d2fSBarry Smith   PetscFunctionBegin;
71ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
72b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
731ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
74f1af5d2fSBarry Smith 
75f1af5d2fSBarry Smith   /* forward solve the U^T */
76f1af5d2fSBarry Smith   idx = 0;
77f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
78f1af5d2fSBarry Smith 
79f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
80f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
81ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx];
82f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
83f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
84f1af5d2fSBarry Smith     v += 4;
85f1af5d2fSBarry Smith 
86f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
87f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
88f1af5d2fSBarry Smith     while (nz--) {
89f1af5d2fSBarry Smith       oidx = 2*(*vi++);
90f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2;
91f1af5d2fSBarry Smith       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
92f1af5d2fSBarry Smith       v  += 4;
93f1af5d2fSBarry Smith     }
94f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2;
95f1af5d2fSBarry Smith     idx += 2;
96f1af5d2fSBarry Smith   }
97f1af5d2fSBarry Smith   /* backward solve the L^T */
98f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
99f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
100f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
101f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
102f1af5d2fSBarry Smith     idt  = 2*i;
103f1af5d2fSBarry Smith     s1   = x[idt];  s2 = x[1+idt];
104f1af5d2fSBarry Smith     while (nz--) {
105f1af5d2fSBarry Smith       idx   = 2*(*vi--);
106f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2;
107f1af5d2fSBarry Smith       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
108f1af5d2fSBarry Smith       v -= 4;
109f1af5d2fSBarry Smith     }
110f1af5d2fSBarry Smith   }
111b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
113dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4.0*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
114f1af5d2fSBarry Smith   PetscFunctionReturn(0);
115f1af5d2fSBarry Smith }
116f1af5d2fSBarry Smith 
1174a2ae208SSatish Balay #undef __FUNCT__
1184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_NaturalOrdering"
1194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
1206929473cSShri Abhyankar {
1216929473cSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1226929473cSShri Abhyankar   PetscErrorCode    ierr;
123b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
1246929473cSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
125b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
126b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
127b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x;
128b3260449SShri Abhyankar   const PetscScalar *b;
1296929473cSShri Abhyankar 
1306929473cSShri Abhyankar   PetscFunctionBegin;
1316929473cSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
132b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1336929473cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1346929473cSShri Abhyankar 
1356929473cSShri Abhyankar   /* forward solve the U^T */
1366929473cSShri Abhyankar   idx = 0;
1376929473cSShri Abhyankar   for (i=0; i<n; i++) {
1386929473cSShri Abhyankar     v     = aa + bs2*diag[i];
1396929473cSShri Abhyankar     /* multiply by the inverse of the block diagonal */
1406929473cSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];
1416929473cSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
1426929473cSShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
1436929473cSShri Abhyankar     v -= bs2;
1446929473cSShri Abhyankar 
1456929473cSShri Abhyankar     vi    = aj + diag[i] - 1;
1466929473cSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
1476929473cSShri Abhyankar     for(j=0;j>-nz;j--){
1486929473cSShri Abhyankar       oidx = bs*vi[j];
1496929473cSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2;
1506929473cSShri Abhyankar       x[oidx+1] -= v[2]*s1  +  v[3]*s2;
1516929473cSShri Abhyankar       v  -= bs2;
1526929473cSShri Abhyankar     }
1536929473cSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;
1546929473cSShri Abhyankar     idx += bs;
1556929473cSShri Abhyankar   }
1566929473cSShri Abhyankar   /* backward solve the L^T */
1576929473cSShri Abhyankar   for (i=n-1; i>=0; i--){
1586929473cSShri Abhyankar     v    = aa + bs2*ai[i];
1596929473cSShri Abhyankar     vi   = aj + ai[i];
1606929473cSShri Abhyankar     nz   = ai[i+1] - ai[i];
1616929473cSShri Abhyankar     idt  = bs*i;
1626929473cSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];
1636929473cSShri Abhyankar     for(j=0;j<nz;j++){
1646929473cSShri Abhyankar       idx   = bs*vi[j];
1656929473cSShri Abhyankar       x[idx]   -=  v[0]*s1 +  v[1]*s2;
1666929473cSShri Abhyankar       x[idx+1] -=  v[2]*s1 +  v[3]*s2;
1676929473cSShri Abhyankar       v += bs2;
1686929473cSShri Abhyankar     }
1696929473cSShri Abhyankar   }
170b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1716929473cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1726929473cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
1736929473cSShri Abhyankar   PetscFunctionReturn(0);
1746929473cSShri Abhyankar }
1756929473cSShri Abhyankar 
1766929473cSShri Abhyankar #undef __FUNCT__
17706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace"
17806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
179f1af5d2fSBarry Smith {
180f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
181dfbe8321SBarry Smith   PetscErrorCode    ierr;
182b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
183b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
184b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
185b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
186b3260449SShri Abhyankar   const PetscScalar *b;
187f1af5d2fSBarry Smith 
188f1af5d2fSBarry Smith   PetscFunctionBegin;
189ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
190b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
1911ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
192f1af5d2fSBarry Smith 
193f1af5d2fSBarry Smith   /* forward solve the U^T */
194f1af5d2fSBarry Smith   idx = 0;
195f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
196f1af5d2fSBarry Smith 
197f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
198f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
199ef66eb69SBarry Smith     x1 = x[idx];   x2 = x[1+idx]; x3    = x[2+idx];
200f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
201f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
202f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
203f1af5d2fSBarry Smith     v += 9;
204f1af5d2fSBarry Smith 
205f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
206f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
207f1af5d2fSBarry Smith     while (nz--) {
208f1af5d2fSBarry Smith       oidx = 3*(*vi++);
209f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
210f1af5d2fSBarry Smith       x[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
211f1af5d2fSBarry Smith       x[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
212f1af5d2fSBarry Smith       v  += 9;
213f1af5d2fSBarry Smith     }
214f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;
215f1af5d2fSBarry Smith     idx += 3;
216f1af5d2fSBarry Smith   }
217f1af5d2fSBarry Smith   /* backward solve the L^T */
218f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
219f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
220f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
221f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
222f1af5d2fSBarry Smith     idt  = 3*i;
223f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];
224f1af5d2fSBarry Smith     while (nz--) {
225f1af5d2fSBarry Smith       idx   = 3*(*vi--);
226f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
227f1af5d2fSBarry Smith       x[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
228f1af5d2fSBarry Smith       x[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
229f1af5d2fSBarry Smith       v -= 9;
230f1af5d2fSBarry Smith     }
231f1af5d2fSBarry Smith   }
232b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2331ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
234dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9.0*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
235f1af5d2fSBarry Smith   PetscFunctionReturn(0);
236f1af5d2fSBarry Smith }
237f1af5d2fSBarry Smith 
2384a2ae208SSatish Balay #undef __FUNCT__
2394dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_NaturalOrdering"
2404dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
2418499736aSShri Abhyankar {
2428499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
2438499736aSShri Abhyankar   PetscErrorCode    ierr;
244b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
2458499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
246b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
247b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
248b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x;
249b3260449SShri Abhyankar   const PetscScalar *b;
2508499736aSShri Abhyankar 
2518499736aSShri Abhyankar   PetscFunctionBegin;
2528499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
253b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2548499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2558499736aSShri Abhyankar 
2568499736aSShri Abhyankar   /* forward solve the U^T */
2578499736aSShri Abhyankar   idx = 0;
2588499736aSShri Abhyankar   for (i=0; i<n; i++) {
2598499736aSShri Abhyankar     v     = aa + bs2*diag[i];
2608499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
2618499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];
2628499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2  + v[2]*x3;
2638499736aSShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2  + v[5]*x3;
2648499736aSShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2  + v[8]*x3;
2658499736aSShri Abhyankar     v -= bs2;
2668499736aSShri Abhyankar 
2678499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
2688499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
2698499736aSShri Abhyankar     for(j=0;j>-nz;j--){
2708499736aSShri Abhyankar       oidx = bs*vi[j];
2718499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
2728499736aSShri Abhyankar       x[oidx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
2738499736aSShri Abhyankar       x[oidx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
2748499736aSShri Abhyankar       v  -= bs2;
2758499736aSShri Abhyankar     }
2768499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;
2778499736aSShri Abhyankar     idx += bs;
2788499736aSShri Abhyankar   }
2798499736aSShri Abhyankar   /* backward solve the L^T */
2808499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
2818499736aSShri Abhyankar     v    = aa + bs2*ai[i];
2828499736aSShri Abhyankar     vi   = aj + ai[i];
2838499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
2848499736aSShri Abhyankar     idt  = bs*i;
2858499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];
2868499736aSShri Abhyankar     for(j=0;j<nz;j++){
2878499736aSShri Abhyankar       idx   = bs*vi[j];
2888499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2  + v[2]*s3;
2898499736aSShri Abhyankar       x[idx+1] -= v[3]*s1  +  v[4]*s2  + v[5]*s3;
2908499736aSShri Abhyankar       x[idx+2] -= v[6]*s1  +  v[7]*s2  + v[8]*s3;
2918499736aSShri Abhyankar       v += bs2;
2928499736aSShri Abhyankar     }
2938499736aSShri Abhyankar   }
294b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
2958499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2968499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
2978499736aSShri Abhyankar   PetscFunctionReturn(0);
2988499736aSShri Abhyankar }
2998499736aSShri Abhyankar 
3008499736aSShri Abhyankar #undef __FUNCT__
30106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace"
30206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
303f1af5d2fSBarry Smith {
304f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
305dfbe8321SBarry Smith   PetscErrorCode    ierr;
306b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
307b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
308b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
309b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
310b3260449SShri Abhyankar   const PetscScalar *b;
311f1af5d2fSBarry Smith 
312f1af5d2fSBarry Smith   PetscFunctionBegin;
313ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
314b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3151ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316f1af5d2fSBarry Smith 
317f1af5d2fSBarry Smith   /* forward solve the U^T */
318f1af5d2fSBarry Smith   idx = 0;
319f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
320f1af5d2fSBarry Smith 
321f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
322f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
323ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx];
324f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
325f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
326f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
327f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
328f1af5d2fSBarry Smith     v += 16;
329f1af5d2fSBarry Smith 
330f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
331f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
332f1af5d2fSBarry Smith     while (nz--) {
333f1af5d2fSBarry Smith       oidx = 4*(*vi++);
334f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
335f1af5d2fSBarry Smith       x[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
336f1af5d2fSBarry Smith       x[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
337f1af5d2fSBarry Smith       x[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
338f1af5d2fSBarry Smith       v  += 16;
339f1af5d2fSBarry Smith     }
340f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4;
341f1af5d2fSBarry Smith     idx += 4;
342f1af5d2fSBarry Smith   }
343f1af5d2fSBarry Smith   /* backward solve the L^T */
344f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
345f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
346f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
347f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
348f1af5d2fSBarry Smith     idt  = 4*i;
349f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt];
350f1af5d2fSBarry Smith     while (nz--) {
351f1af5d2fSBarry Smith       idx   = 4*(*vi--);
352f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
353f1af5d2fSBarry Smith       x[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
354f1af5d2fSBarry Smith       x[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
355f1af5d2fSBarry Smith       x[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
356f1af5d2fSBarry Smith       v -= 16;
357f1af5d2fSBarry Smith     }
358f1af5d2fSBarry Smith   }
359b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3601ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
361dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
362f1af5d2fSBarry Smith   PetscFunctionReturn(0);
363f1af5d2fSBarry Smith }
364f1af5d2fSBarry Smith 
3654a2ae208SSatish Balay #undef __FUNCT__
3664dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_NaturalOrdering"
3674dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
3688499736aSShri Abhyankar {
3698499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
3708499736aSShri Abhyankar   PetscErrorCode    ierr;
371b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
3728499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
373b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
374b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
375b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x;
376b3260449SShri Abhyankar   const PetscScalar *b;
3778499736aSShri Abhyankar 
3788499736aSShri Abhyankar   PetscFunctionBegin;
3798499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
380b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
3818499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3828499736aSShri Abhyankar 
3838499736aSShri Abhyankar   /* forward solve the U^T */
3848499736aSShri Abhyankar   idx = 0;
3858499736aSShri Abhyankar   for (i=0; i<n; i++) {
3868499736aSShri Abhyankar     v     = aa + bs2*diag[i];
3878499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
3888499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
3898499736aSShri Abhyankar     s1 =  v[0]*x1  +  v[1]*x2  + v[2]*x3  + v[3]*x4;
3908499736aSShri Abhyankar     s2 =  v[4]*x1  +  v[5]*x2  + v[6]*x3  + v[7]*x4;
3918499736aSShri Abhyankar     s3 =  v[8]*x1  +  v[9]*x2  + v[10]*x3 + v[11]*x4;
3928499736aSShri Abhyankar     s4 =  v[12]*x1 +  v[13]*x2 + v[14]*x3 + v[15]*x4;
3938499736aSShri Abhyankar     v -= bs2;
3948499736aSShri Abhyankar 
3958499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
3968499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
3978499736aSShri Abhyankar     for(j=0;j>-nz;j--){
3988499736aSShri Abhyankar       oidx = bs*vi[j];
3998499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4008499736aSShri Abhyankar       x[oidx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4018499736aSShri Abhyankar       x[oidx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4028499736aSShri Abhyankar       x[oidx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4038499736aSShri Abhyankar       v  -= bs2;
4048499736aSShri Abhyankar     }
4058499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4;
4068499736aSShri Abhyankar     idx += bs;
4078499736aSShri Abhyankar   }
4088499736aSShri Abhyankar   /* backward solve the L^T */
4098499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
4108499736aSShri Abhyankar     v    = aa + bs2*ai[i];
4118499736aSShri Abhyankar     vi   = aj + ai[i];
4128499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
4138499736aSShri Abhyankar     idt  = bs*i;
4148499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];
4158499736aSShri Abhyankar     for(j=0;j<nz;j++){
4168499736aSShri Abhyankar       idx   = bs*vi[j];
4178499736aSShri Abhyankar       x[idx]   -=  v[0]*s1  +  v[1]*s2  + v[2]*s3  + v[3]*s4;
4188499736aSShri Abhyankar       x[idx+1] -=  v[4]*s1  +  v[5]*s2  + v[6]*s3  + v[7]*s4;
4198499736aSShri Abhyankar       x[idx+2] -=  v[8]*s1  +  v[9]*s2  + v[10]*s3 + v[11]*s4;
4208499736aSShri Abhyankar       x[idx+3] -=  v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
4218499736aSShri Abhyankar       v += bs2;
4228499736aSShri Abhyankar     }
4238499736aSShri Abhyankar   }
424b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4258499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4268499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4278499736aSShri Abhyankar   PetscFunctionReturn(0);
4288499736aSShri Abhyankar }
4298499736aSShri Abhyankar 
4308499736aSShri Abhyankar #undef __FUNCT__
43106e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace"
43206e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
433f1af5d2fSBarry Smith {
434f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
435dfbe8321SBarry Smith   PetscErrorCode    ierr;
436b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
437b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
438b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
439b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
440b3260449SShri Abhyankar   const PetscScalar *b;
441f1af5d2fSBarry Smith 
442f1af5d2fSBarry Smith   PetscFunctionBegin;
443ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
444b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4451ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
446f1af5d2fSBarry Smith 
447f1af5d2fSBarry Smith   /* forward solve the U^T */
448f1af5d2fSBarry Smith   idx = 0;
449f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
450f1af5d2fSBarry Smith 
451f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
452f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
453ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
454f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
455f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
456f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
457f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
458f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
459f1af5d2fSBarry Smith     v += 25;
460f1af5d2fSBarry Smith 
461f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
462f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
463f1af5d2fSBarry Smith     while (nz--) {
464f1af5d2fSBarry Smith       oidx = 5*(*vi++);
465f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
466f1af5d2fSBarry Smith       x[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
467f1af5d2fSBarry Smith       x[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
468f1af5d2fSBarry Smith       x[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
469f1af5d2fSBarry Smith       x[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
470f1af5d2fSBarry Smith       v  += 25;
471f1af5d2fSBarry Smith     }
472f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
473f1af5d2fSBarry Smith     idx += 5;
474f1af5d2fSBarry Smith   }
475f1af5d2fSBarry Smith   /* backward solve the L^T */
476f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
477f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
478f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
479f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
480f1af5d2fSBarry Smith     idt  = 5*i;
481f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
482f1af5d2fSBarry Smith     while (nz--) {
483f1af5d2fSBarry Smith       idx   = 5*(*vi--);
484f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
485f1af5d2fSBarry Smith       x[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
486f1af5d2fSBarry Smith       x[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
487f1af5d2fSBarry Smith       x[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
488f1af5d2fSBarry Smith       x[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
489f1af5d2fSBarry Smith       v -= 25;
490f1af5d2fSBarry Smith     }
491f1af5d2fSBarry Smith   }
492b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4931ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
494dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
495f1af5d2fSBarry Smith   PetscFunctionReturn(0);
496f1af5d2fSBarry Smith }
497f1af5d2fSBarry Smith 
4984a2ae208SSatish Balay #undef __FUNCT__
4994dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_NaturalOrdering"
5004dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
5018499736aSShri Abhyankar {
5028499736aSShri Abhyankar   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ *)A->data;
5038499736aSShri Abhyankar   PetscErrorCode ierr;
504b3260449SShri Abhyankar   const PetscInt       n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5058499736aSShri Abhyankar   PetscInt       nz,idx,idt,j,i,oidx;
506b3260449SShri Abhyankar   const PetscInt       bs=A->rmap->bs,bs2=a->bs2;
507b3260449SShri Abhyankar   const MatScalar      *aa=a->a,*v;
508b3260449SShri Abhyankar   PetscScalar    s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x;
509b3260449SShri Abhyankar   const PetscScalar    *b;
5108499736aSShri Abhyankar 
5118499736aSShri Abhyankar   PetscFunctionBegin;
5128499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
513b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5148499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5158499736aSShri Abhyankar 
5168499736aSShri Abhyankar   /* forward solve the U^T */
5178499736aSShri Abhyankar   idx = 0;
5188499736aSShri Abhyankar   for (i=0; i<n; i++) {
5198499736aSShri Abhyankar     v     = aa + bs2*diag[i];
5208499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
5218499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
5228499736aSShri Abhyankar     x5 = x[4+idx];
5238499736aSShri Abhyankar     s1 =  v[0]*x1   +  v[1]*x2   + v[2]*x3   + v[3]*x4   + v[4]*x5;
5248499736aSShri Abhyankar     s2 =  v[5]*x1   +  v[6]*x2   + v[7]*x3   + v[8]*x4   + v[9]*x5;
5258499736aSShri Abhyankar     s3 =  v[10]*x1  +  v[11]*x2  + v[12]*x3  + v[13]*x4  + v[14]*x5;
5268499736aSShri Abhyankar     s4 =  v[15]*x1  +  v[16]*x2  + v[17]*x3  + v[18]*x4  + v[19]*x5;
5278499736aSShri Abhyankar     s5 =  v[20]*x1  +  v[21]*x2  + v[22]*x3  + v[23]*x4   + v[24]*x5;
5288499736aSShri Abhyankar     v -= bs2;
5298499736aSShri Abhyankar 
5308499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
5318499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
5328499736aSShri Abhyankar     for(j=0;j>-nz;j--){
5338499736aSShri Abhyankar       oidx = bs*vi[j];
5348499736aSShri Abhyankar       x[oidx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5358499736aSShri Abhyankar       x[oidx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5368499736aSShri Abhyankar       x[oidx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5378499736aSShri Abhyankar       x[oidx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5388499736aSShri Abhyankar       x[oidx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5398499736aSShri Abhyankar       v  -= bs2;
5408499736aSShri Abhyankar     }
5418499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
5428499736aSShri Abhyankar     idx += bs;
5438499736aSShri Abhyankar   }
5448499736aSShri Abhyankar   /* backward solve the L^T */
5458499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
5468499736aSShri Abhyankar     v    = aa + bs2*ai[i];
5478499736aSShri Abhyankar     vi   = aj + ai[i];
5488499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
5498499736aSShri Abhyankar     idt  = bs*i;
5508499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
5518499736aSShri Abhyankar     for(j=0;j<nz;j++){
5528499736aSShri Abhyankar       idx   = bs*vi[j];
5538499736aSShri Abhyankar       x[idx]   -=  v[0]*s1   +  v[1]*s2   + v[2]*s3   + v[3]*s4   + v[4]*s5;
5548499736aSShri Abhyankar       x[idx+1] -=  v[5]*s1   +  v[6]*s2   + v[7]*s3   + v[8]*s4   + v[9]*s5;
5558499736aSShri Abhyankar       x[idx+2] -=  v[10]*s1  +  v[11]*s2  + v[12]*s3  + v[13]*s4  + v[14]*s5;
5568499736aSShri Abhyankar       x[idx+3] -=  v[15]*s1  +  v[16]*s2  + v[17]*s3  + v[18]*s4  + v[19]*s5;
5578499736aSShri Abhyankar       x[idx+4] -=  v[20]*s1  +  v[21]*s2  + v[22]*s3  + v[23]*s4   + v[24]*s5;
5588499736aSShri Abhyankar       v += bs2;
5598499736aSShri Abhyankar     }
5608499736aSShri Abhyankar   }
561b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5628499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5638499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
5648499736aSShri Abhyankar   PetscFunctionReturn(0);
5658499736aSShri Abhyankar }
5668499736aSShri Abhyankar 
5678499736aSShri Abhyankar #undef __FUNCT__
56806e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace"
56906e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
570f1af5d2fSBarry Smith {
571f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
572dfbe8321SBarry Smith   PetscErrorCode    ierr;
573b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
574b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
575b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
576b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
577b3260449SShri Abhyankar   const PetscScalar *b;
578f1af5d2fSBarry Smith 
579f1af5d2fSBarry Smith   PetscFunctionBegin;
580ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
581b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5821ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
583f1af5d2fSBarry Smith 
584f1af5d2fSBarry Smith   /* forward solve the U^T */
585f1af5d2fSBarry Smith   idx = 0;
586f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
587f1af5d2fSBarry Smith 
588f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
589f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
590ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
591ef66eb69SBarry Smith     x6    = x[5+idx];
592f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
593f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
594f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
595f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
596f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
597f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
598f1af5d2fSBarry Smith     v += 36;
599f1af5d2fSBarry Smith 
600f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
601f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
602f1af5d2fSBarry Smith     while (nz--) {
603f1af5d2fSBarry Smith       oidx = 6*(*vi++);
604f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
605f1af5d2fSBarry Smith       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
606f1af5d2fSBarry Smith       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
607f1af5d2fSBarry Smith       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
608f1af5d2fSBarry Smith       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
609f1af5d2fSBarry Smith       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
610f1af5d2fSBarry Smith       v  += 36;
611f1af5d2fSBarry Smith     }
612f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
613f1af5d2fSBarry Smith     x[5+idx] = s6;
614f1af5d2fSBarry Smith     idx += 6;
615f1af5d2fSBarry Smith   }
616f1af5d2fSBarry Smith   /* backward solve the L^T */
617f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
618f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
619f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
620f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
621f1af5d2fSBarry Smith     idt  = 6*i;
622f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
623f1af5d2fSBarry Smith     s6 = x[5+idt];
624f1af5d2fSBarry Smith     while (nz--) {
625f1af5d2fSBarry Smith       idx   = 6*(*vi--);
626f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
627f1af5d2fSBarry Smith       x[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
628f1af5d2fSBarry Smith       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
629f1af5d2fSBarry Smith       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
630f1af5d2fSBarry Smith       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
631f1af5d2fSBarry Smith       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
632f1af5d2fSBarry Smith       v -= 36;
633f1af5d2fSBarry Smith     }
634f1af5d2fSBarry Smith   }
635b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
6361ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
637dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
638f1af5d2fSBarry Smith   PetscFunctionReturn(0);
639f1af5d2fSBarry Smith }
640f1af5d2fSBarry Smith 
6414a2ae208SSatish Balay #undef __FUNCT__
6424dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_NaturalOrdering"
6434dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
6448499736aSShri Abhyankar {
6458499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
6468499736aSShri Abhyankar   PetscErrorCode    ierr;
647b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
6488499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
649b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
650b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
651b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x;
652b3260449SShri Abhyankar   const PetscScalar *b;
6538499736aSShri Abhyankar 
6548499736aSShri Abhyankar   PetscFunctionBegin;
6558499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
656b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
6578499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
6588499736aSShri Abhyankar 
6598499736aSShri Abhyankar   /* forward solve the U^T */
6608499736aSShri Abhyankar   idx = 0;
6618499736aSShri Abhyankar   for (i=0; i<n; i++) {
6628499736aSShri Abhyankar     v     = aa + bs2*diag[i];
6638499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
6648499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
6658499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];
6668499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
6678499736aSShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
6688499736aSShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
6698499736aSShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
6708499736aSShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
6718499736aSShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
6728499736aSShri Abhyankar     v -= bs2;
6738499736aSShri Abhyankar 
6748499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
6758499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
6768499736aSShri Abhyankar     for(j=0;j>-nz;j--){
6778499736aSShri Abhyankar       oidx = bs*vi[j];
6788499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
6798499736aSShri Abhyankar       x[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
6808499736aSShri Abhyankar       x[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
6818499736aSShri Abhyankar       x[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
6828499736aSShri Abhyankar       x[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
6838499736aSShri Abhyankar       x[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
6848499736aSShri Abhyankar       v  -= bs2;
6858499736aSShri Abhyankar     }
6868499736aSShri Abhyankar     x[idx]   = s1;x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
6878499736aSShri Abhyankar     x[5+idx] = s6;
6888499736aSShri Abhyankar     idx += bs;
6898499736aSShri Abhyankar   }
6908499736aSShri Abhyankar   /* backward solve the L^T */
6918499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
6928499736aSShri Abhyankar     v    = aa + bs2*ai[i];
6938499736aSShri Abhyankar     vi   = aj + ai[i];
6948499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
6958499736aSShri Abhyankar     idt  = bs*i;
6968499736aSShri Abhyankar     s1   = x[idt];  s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
6978499736aSShri Abhyankar     s6   = x[5+idt];
6988499736aSShri Abhyankar     for(j=0;j<nz;j++){
6998499736aSShri Abhyankar       idx   = bs*vi[j];
7008499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
7018499736aSShri Abhyankar       x[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
7028499736aSShri Abhyankar       x[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
7038499736aSShri Abhyankar       x[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
7048499736aSShri Abhyankar       x[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
7058499736aSShri Abhyankar       x[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
7068499736aSShri Abhyankar       v += bs2;
7078499736aSShri Abhyankar     }
7088499736aSShri Abhyankar   }
709b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
7108499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
7118499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
7128499736aSShri Abhyankar   PetscFunctionReturn(0);
7138499736aSShri Abhyankar }
7148499736aSShri Abhyankar 
7158499736aSShri Abhyankar #undef __FUNCT__
71606e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace"
71706e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
718f1af5d2fSBarry Smith {
719f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
720dfbe8321SBarry Smith   PetscErrorCode    ierr;
721b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
722b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,oidx;
723b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
724b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
725b3260449SShri Abhyankar   const PetscScalar *b;
726f1af5d2fSBarry Smith 
727f1af5d2fSBarry Smith   PetscFunctionBegin;
728ef66eb69SBarry Smith   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
729b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
7301ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
731f1af5d2fSBarry Smith 
732f1af5d2fSBarry Smith   /* forward solve the U^T */
733f1af5d2fSBarry Smith   idx = 0;
734f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
735f1af5d2fSBarry Smith 
736f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
737f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
738ef66eb69SBarry Smith     x1    = x[idx];   x2 = x[1+idx]; x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
739ef66eb69SBarry Smith     x6    = x[5+idx]; x7 = x[6+idx];
740f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
741f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
742f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
743f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
744f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
745f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
746f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
747f1af5d2fSBarry Smith     v += 49;
748f1af5d2fSBarry Smith 
749f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
750f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
751f1af5d2fSBarry Smith     while (nz--) {
752f1af5d2fSBarry Smith       oidx = 7*(*vi++);
753f1af5d2fSBarry Smith       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
754f1af5d2fSBarry Smith       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
755f1af5d2fSBarry Smith       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
756f1af5d2fSBarry Smith       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
757f1af5d2fSBarry Smith       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
758f1af5d2fSBarry Smith       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
759f1af5d2fSBarry Smith       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
760f1af5d2fSBarry Smith       v  += 49;
761f1af5d2fSBarry Smith     }
762f1af5d2fSBarry Smith     x[idx]   = s1;x[1+idx] = s2; x[2+idx] = s3;x[3+idx] = s4; x[4+idx] = s5;
763f1af5d2fSBarry Smith     x[5+idx] = s6;x[6+idx] = s7;
764f1af5d2fSBarry Smith     idx += 7;
765f1af5d2fSBarry Smith   }
766f1af5d2fSBarry Smith   /* backward solve the L^T */
767f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
768f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
769f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
770f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
771f1af5d2fSBarry Smith     idt  = 7*i;
772f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt]; s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
773f1af5d2fSBarry Smith     s6 = x[5+idt];s7 = x[6+idt];
774f1af5d2fSBarry Smith     while (nz--) {
775f1af5d2fSBarry Smith       idx   = 7*(*vi--);
776f1af5d2fSBarry Smith       x[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
777f1af5d2fSBarry Smith       x[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
778f1af5d2fSBarry Smith       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
779f1af5d2fSBarry Smith       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
780f1af5d2fSBarry Smith       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
781f1af5d2fSBarry Smith       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
782f1af5d2fSBarry Smith       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
783f1af5d2fSBarry Smith       v -= 49;
784f1af5d2fSBarry Smith     }
785f1af5d2fSBarry Smith   }
786b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
7871ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
788dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
789f1af5d2fSBarry Smith   PetscFunctionReturn(0);
790f1af5d2fSBarry Smith }
7918499736aSShri Abhyankar #undef __FUNCT__
7924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_NaturalOrdering"
7934dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
7948499736aSShri Abhyankar {
7958499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
7968499736aSShri Abhyankar   PetscErrorCode    ierr;
797b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
7988499736aSShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx;
799b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
800b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
801b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x;
802b3260449SShri Abhyankar   const PetscScalar *b;
8038499736aSShri Abhyankar 
8048499736aSShri Abhyankar   PetscFunctionBegin;
8058499736aSShri Abhyankar   ierr = VecCopy(bb,xx);CHKERRQ(ierr);
806b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
8078499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
8088499736aSShri Abhyankar 
8098499736aSShri Abhyankar   /* forward solve the U^T */
8108499736aSShri Abhyankar   idx = 0;
8118499736aSShri Abhyankar   for (i=0; i<n; i++) {
8128499736aSShri Abhyankar     v     = aa + bs2*diag[i];
8138499736aSShri Abhyankar     /* multiply by the inverse of the block diagonal */
8148499736aSShri Abhyankar     x1 = x[idx];   x2 = x[1+idx];  x3 = x[2+idx];  x4 = x[3+idx];
8158499736aSShri Abhyankar     x5 = x[4+idx]; x6 = x[5+idx];  x7 = x[6+idx];
8168499736aSShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
8178499736aSShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
8188499736aSShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
8198499736aSShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
8208499736aSShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
8218499736aSShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
8228499736aSShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
8238499736aSShri Abhyankar     v -= bs2;
8248499736aSShri Abhyankar     vi    = aj + diag[i] - 1;
8258499736aSShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
8268499736aSShri Abhyankar     for(j=0;j>-nz;j--){
8278499736aSShri Abhyankar       oidx = bs*vi[j];
8288499736aSShri Abhyankar       x[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8298499736aSShri Abhyankar       x[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8308499736aSShri Abhyankar       x[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8318499736aSShri Abhyankar       x[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8328499736aSShri Abhyankar       x[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8338499736aSShri Abhyankar       x[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8348499736aSShri Abhyankar       x[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8358499736aSShri Abhyankar       v  -= bs2;
8368499736aSShri Abhyankar     }
8378499736aSShri Abhyankar     x[idx]   = s1;  x[1+idx] = s2;  x[2+idx] = s3;  x[3+idx] = s4; x[4+idx] = s5;
8388499736aSShri Abhyankar     x[5+idx] = s6;  x[6+idx] = s7;
8398499736aSShri Abhyankar     idx += bs;
8408499736aSShri Abhyankar   }
8418499736aSShri Abhyankar   /* backward solve the L^T */
8428499736aSShri Abhyankar   for (i=n-1; i>=0; i--){
8438499736aSShri Abhyankar     v    = aa + bs2*ai[i];
8448499736aSShri Abhyankar     vi   = aj + ai[i];
8458499736aSShri Abhyankar     nz   = ai[i+1] - ai[i];
8468499736aSShri Abhyankar     idt  = bs*i;
8478499736aSShri Abhyankar     s1   = x[idt];    s2 = x[1+idt];  s3 = x[2+idt];  s4 = x[3+idt];  s5 = x[4+idt];
8488499736aSShri Abhyankar     s6   = x[5+idt];  s7 = x[6+idt];
8498499736aSShri Abhyankar     for(j=0;j<nz;j++){
8508499736aSShri Abhyankar       idx   = bs*vi[j];
8518499736aSShri Abhyankar       x[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
8528499736aSShri Abhyankar       x[idx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
8538499736aSShri Abhyankar       x[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
8548499736aSShri Abhyankar       x[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
8558499736aSShri Abhyankar       x[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
8568499736aSShri Abhyankar       x[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
8578499736aSShri Abhyankar       x[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
8588499736aSShri Abhyankar       v += bs2;
8598499736aSShri Abhyankar     }
8608499736aSShri Abhyankar   }
861b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
8628499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
8638499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
8648499736aSShri Abhyankar   PetscFunctionReturn(0);
8658499736aSShri Abhyankar }
866f1af5d2fSBarry Smith 
867f1af5d2fSBarry Smith /*---------------------------------------------------------------------------------------------*/
8684a2ae208SSatish Balay #undef __FUNCT__
86906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_1_inplace"
87006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
871f1af5d2fSBarry Smith {
872f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
873f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
8746849ba73SBarry Smith   PetscErrorCode    ierr;
8755d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
876b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
877b3260449SShri Abhyankar   PetscInt          i,nz;
878b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
879b3260449SShri Abhyankar   PetscScalar       s1,*x,*t;
880b3260449SShri Abhyankar   const PetscScalar *b;
881f1af5d2fSBarry Smith 
882f1af5d2fSBarry Smith   PetscFunctionBegin;
883b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
8841ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
885f1af5d2fSBarry Smith   t  = a->solve_work;
886f1af5d2fSBarry Smith 
887f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
888f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
889f1af5d2fSBarry Smith 
890f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
891f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
892f1af5d2fSBarry Smith     t[i] = b[c[i]];
893f1af5d2fSBarry Smith   }
894f1af5d2fSBarry Smith 
895f1af5d2fSBarry Smith   /* forward solve the U^T */
896f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
897f1af5d2fSBarry Smith 
898f1af5d2fSBarry Smith     v     = aa + diag[i];
899f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
900f1af5d2fSBarry Smith     s1    = (*v++)*t[i];
901f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
902f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
903f1af5d2fSBarry Smith     while (nz--) {
904f1af5d2fSBarry Smith       t[*vi++]  -= (*v++)*s1;
905f1af5d2fSBarry Smith     }
906f1af5d2fSBarry Smith     t[i]   = s1;
907f1af5d2fSBarry Smith   }
908f1af5d2fSBarry Smith   /* backward solve the L^T */
909f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
910f1af5d2fSBarry Smith     v    = aa + diag[i] - 1;
911f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
912f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
913f1af5d2fSBarry Smith     s1   = t[i];
914f1af5d2fSBarry Smith     while (nz--) {
915f1af5d2fSBarry Smith       t[*vi--]   -=  (*v--)*s1;
916f1af5d2fSBarry Smith     }
917f1af5d2fSBarry Smith   }
918f1af5d2fSBarry Smith 
919f1af5d2fSBarry Smith   /* copy t into x according to permutation */
920f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
921f1af5d2fSBarry Smith     x[r[i]]   = t[i];
922f1af5d2fSBarry Smith   }
923f1af5d2fSBarry Smith 
924f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
925f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
926b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
9271ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
928dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
929f1af5d2fSBarry Smith   PetscFunctionReturn(0);
930f1af5d2fSBarry Smith }
931f1af5d2fSBarry Smith 
9324a2ae208SSatish Balay #undef __FUNCT__
93306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2_inplace"
93406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
935f1af5d2fSBarry Smith {
936f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
937f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
9386849ba73SBarry Smith   PetscErrorCode    ierr;
9395d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
940b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
941b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
942b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
943b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
944b3260449SShri Abhyankar   const PetscScalar *b;
945f1af5d2fSBarry Smith 
946f1af5d2fSBarry Smith   PetscFunctionBegin;
947b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
9481ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
949f1af5d2fSBarry Smith   t  = a->solve_work;
950f1af5d2fSBarry Smith 
951f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
952f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
953f1af5d2fSBarry Smith 
954f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
955f1af5d2fSBarry Smith   ii = 0;
956f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
957f1af5d2fSBarry Smith     ic      = 2*c[i];
958f1af5d2fSBarry Smith     t[ii]   = b[ic];
959f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
960f1af5d2fSBarry Smith     ii += 2;
961f1af5d2fSBarry Smith   }
962f1af5d2fSBarry Smith 
963f1af5d2fSBarry Smith   /* forward solve the U^T */
964f1af5d2fSBarry Smith   idx = 0;
965f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
966f1af5d2fSBarry Smith 
967f1af5d2fSBarry Smith     v     = aa + 4*diag[i];
968f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
969f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx];
970f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2;
971f1af5d2fSBarry Smith     s2 = v[2]*x1  +  v[3]*x2;
972f1af5d2fSBarry Smith     v += 4;
973f1af5d2fSBarry Smith 
974f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
975f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
976f1af5d2fSBarry Smith     while (nz--) {
977f1af5d2fSBarry Smith       oidx = 2*(*vi++);
978f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2;
979f1af5d2fSBarry Smith       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
980f1af5d2fSBarry Smith       v  += 4;
981f1af5d2fSBarry Smith     }
982f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
983f1af5d2fSBarry Smith     idx += 2;
984f1af5d2fSBarry Smith   }
985f1af5d2fSBarry Smith   /* backward solve the L^T */
986f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
987f1af5d2fSBarry Smith     v    = aa + 4*diag[i] - 4;
988f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
989f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
990f1af5d2fSBarry Smith     idt  = 2*i;
991f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
992f1af5d2fSBarry Smith     while (nz--) {
993f1af5d2fSBarry Smith       idx   = 2*(*vi--);
994f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2;
995f1af5d2fSBarry Smith       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
996f1af5d2fSBarry Smith       v -= 4;
997f1af5d2fSBarry Smith     }
998f1af5d2fSBarry Smith   }
999f1af5d2fSBarry Smith 
1000f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1001f1af5d2fSBarry Smith   ii = 0;
1002f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1003f1af5d2fSBarry Smith     ir      = 2*r[i];
1004f1af5d2fSBarry Smith     x[ir]   = t[ii];
1005f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1006f1af5d2fSBarry Smith     ii += 2;
1007f1af5d2fSBarry Smith   }
1008f1af5d2fSBarry Smith 
1009f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1010f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1011b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
10121ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1013dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
1014f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1015f1af5d2fSBarry Smith }
1016f1af5d2fSBarry Smith 
10174a2ae208SSatish Balay #undef __FUNCT__
10184dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_2"
10194dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
102032121132SShri Abhyankar {
102132121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
102232121132SShri Abhyankar   PetscErrorCode    ierr;
102332121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1024b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
102532121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
102632121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1027b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1028b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1029b3260449SShri Abhyankar   PetscScalar       s1,s2,x1,x2,*x,*t;
1030b3260449SShri Abhyankar   const PetscScalar *b;
103132121132SShri Abhyankar 
103232121132SShri Abhyankar   PetscFunctionBegin;
1033b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
103432121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
103532121132SShri Abhyankar   t = a->solve_work;
103632121132SShri Abhyankar 
103732121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
103832121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
103932121132SShri Abhyankar 
104032121132SShri Abhyankar   /* copy b into temp work space according to permutation */
104132121132SShri Abhyankar   for(i=0;i<n;i++){
104232121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
104332121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1];
104432121132SShri Abhyankar   }
104532121132SShri Abhyankar 
104632121132SShri Abhyankar   /* forward solve the U^T */
104732121132SShri Abhyankar   idx = 0;
104832121132SShri Abhyankar   for (i=0; i<n; i++) {
104932121132SShri Abhyankar     v     = aa + bs2*diag[i];
105032121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
105132121132SShri Abhyankar     x1 = t[idx];   x2 = t[1+idx];
105232121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2;
105332121132SShri Abhyankar     s2 = v[2]*x1  +  v[3]*x2;
105432121132SShri Abhyankar     v -= bs2;
105532121132SShri Abhyankar 
105632121132SShri Abhyankar     vi    = aj + diag[i] - 1;
105732121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
105832121132SShri Abhyankar     for(j=0;j>-nz;j--){
105932121132SShri Abhyankar       oidx = bs*vi[j];
106032121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2;
106132121132SShri Abhyankar       t[oidx+1] -= v[2]*s1  +  v[3]*s2;
106232121132SShri Abhyankar       v  -= bs2;
106332121132SShri Abhyankar     }
106432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
106532121132SShri Abhyankar     idx += bs;
106632121132SShri Abhyankar   }
106732121132SShri Abhyankar   /* backward solve the L^T */
106832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
106932121132SShri Abhyankar     v    = aa + bs2*ai[i];
107032121132SShri Abhyankar     vi   = aj + ai[i];
107132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
107232121132SShri Abhyankar     idt  = bs*i;
107332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];
107432121132SShri Abhyankar     for(j=0;j<nz;j++){
107532121132SShri Abhyankar       idx   = bs*vi[j];
107632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2;
107732121132SShri Abhyankar       t[idx+1] -=  v[2]*s1 +  v[3]*s2;
107832121132SShri Abhyankar       v += bs2;
107932121132SShri Abhyankar     }
108032121132SShri Abhyankar   }
108132121132SShri Abhyankar 
108232121132SShri Abhyankar   /* copy t into x according to permutation */
108332121132SShri Abhyankar   for(i=0;i<n;i++){
108432121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
108532121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1];
108632121132SShri Abhyankar   }
108732121132SShri Abhyankar 
108832121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
108932121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1090b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
109132121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
109232121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
109332121132SShri Abhyankar   PetscFunctionReturn(0);
109432121132SShri Abhyankar }
109532121132SShri Abhyankar 
109632121132SShri Abhyankar #undef __FUNCT__
109706e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3_inplace"
109806e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
1099f1af5d2fSBarry Smith {
1100f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1101f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
11026849ba73SBarry Smith   PetscErrorCode    ierr;
11035d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1104b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1105b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1106b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1107b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1108b3260449SShri Abhyankar   const PetscScalar *b;
1109f1af5d2fSBarry Smith 
1110f1af5d2fSBarry Smith   PetscFunctionBegin;
1111b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
11121ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1113f1af5d2fSBarry Smith   t  = a->solve_work;
1114f1af5d2fSBarry Smith 
1115f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1116f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1117f1af5d2fSBarry Smith 
1118f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1119f1af5d2fSBarry Smith   ii = 0;
1120f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1121f1af5d2fSBarry Smith     ic      = 3*c[i];
1122f1af5d2fSBarry Smith     t[ii]   = b[ic];
1123f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1124f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1125f1af5d2fSBarry Smith     ii += 3;
1126f1af5d2fSBarry Smith   }
1127f1af5d2fSBarry Smith 
1128f1af5d2fSBarry Smith   /* forward solve the U^T */
1129f1af5d2fSBarry Smith   idx = 0;
1130f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1131f1af5d2fSBarry Smith 
1132f1af5d2fSBarry Smith     v     = aa + 9*diag[i];
1133f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1134f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
1135f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
1136f1af5d2fSBarry Smith     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
1137f1af5d2fSBarry Smith     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
1138f1af5d2fSBarry Smith     v += 9;
1139f1af5d2fSBarry Smith 
1140f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1141f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1142f1af5d2fSBarry Smith     while (nz--) {
1143f1af5d2fSBarry Smith       oidx = 3*(*vi++);
1144f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
1145f1af5d2fSBarry Smith       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
1146f1af5d2fSBarry Smith       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1147f1af5d2fSBarry Smith       v  += 9;
1148f1af5d2fSBarry Smith     }
1149f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;
1150f1af5d2fSBarry Smith     idx += 3;
1151f1af5d2fSBarry Smith   }
1152f1af5d2fSBarry Smith   /* backward solve the L^T */
1153f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1154f1af5d2fSBarry Smith     v    = aa + 9*diag[i] - 9;
1155f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1156f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1157f1af5d2fSBarry Smith     idt  = 3*i;
1158f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];
1159f1af5d2fSBarry Smith     while (nz--) {
1160f1af5d2fSBarry Smith       idx   = 3*(*vi--);
1161f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3;
1162f1af5d2fSBarry Smith       t[idx+1] -=  v[3]*s1 +  v[4]*s2 +  v[5]*s3;
1163f1af5d2fSBarry Smith       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
1164f1af5d2fSBarry Smith       v -= 9;
1165f1af5d2fSBarry Smith     }
1166f1af5d2fSBarry Smith   }
1167f1af5d2fSBarry Smith 
1168f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1169f1af5d2fSBarry Smith   ii = 0;
1170f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1171f1af5d2fSBarry Smith     ir      = 3*r[i];
1172f1af5d2fSBarry Smith     x[ir]   = t[ii];
1173f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1174f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1175f1af5d2fSBarry Smith     ii += 3;
1176f1af5d2fSBarry Smith   }
1177f1af5d2fSBarry Smith 
1178f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1179f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1180b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
11811ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1182dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
1183f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1184f1af5d2fSBarry Smith }
1185f1af5d2fSBarry Smith 
11864a2ae208SSatish Balay #undef __FUNCT__
11874dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_3"
11884dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
118932121132SShri Abhyankar {
119032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
119132121132SShri Abhyankar   PetscErrorCode    ierr;
119232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1193b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
119432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
119532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1196b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1197b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1198b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,x1,x2,x3,*x,*t;
1199b3260449SShri Abhyankar   const PetscScalar *b;
120032121132SShri Abhyankar 
120132121132SShri Abhyankar   PetscFunctionBegin;
1202b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
120332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
120432121132SShri Abhyankar   t = a->solve_work;
120532121132SShri Abhyankar 
120632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
120732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
120832121132SShri Abhyankar 
120932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
121032121132SShri Abhyankar   for(i=0;i<n;i++){
121132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
121232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2];
121332121132SShri Abhyankar   }
121432121132SShri Abhyankar 
121532121132SShri Abhyankar   /* forward solve the U^T */
121632121132SShri Abhyankar   idx = 0;
121732121132SShri Abhyankar   for (i=0; i<n; i++) {
121832121132SShri Abhyankar     v     = aa + bs2*diag[i];
121932121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
122032121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];
122132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3;
122232121132SShri Abhyankar     s2 = v[3]*x1  +  v[4]*x2 +  v[5]*x3;
122332121132SShri Abhyankar     s3 = v[6]*x1  +  v[7]*x2 + v[8]*x3;
122432121132SShri Abhyankar     v -= bs2;
122532121132SShri Abhyankar 
122632121132SShri Abhyankar     vi    = aj + diag[i] - 1;
122732121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
122832121132SShri Abhyankar     for(j=0;j>-nz;j--){
122932121132SShri Abhyankar       oidx = bs*vi[j];
123032121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
123132121132SShri Abhyankar       t[oidx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
123232121132SShri Abhyankar       t[oidx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
123332121132SShri Abhyankar       v  -= bs2;
123432121132SShri Abhyankar     }
123532121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;
123632121132SShri Abhyankar     idx += bs;
123732121132SShri Abhyankar   }
123832121132SShri Abhyankar   /* backward solve the L^T */
123932121132SShri Abhyankar   for (i=n-1; i>=0; i--){
124032121132SShri Abhyankar     v    = aa + bs2*ai[i];
124132121132SShri Abhyankar     vi   = aj + ai[i];
124232121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
124332121132SShri Abhyankar     idt  = bs*i;
124432121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];
124532121132SShri Abhyankar     for(j=0;j<nz;j++){
124632121132SShri Abhyankar       idx   = bs*vi[j];
124732121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3;
124832121132SShri Abhyankar       t[idx+1] -= v[3]*s1  +  v[4]*s2 +  v[5]*s3;
124932121132SShri Abhyankar       t[idx+2] -= v[6]*s1 + v[7]*s2 + v[8]*s3;
125032121132SShri Abhyankar       v += bs2;
125132121132SShri Abhyankar     }
125232121132SShri Abhyankar   }
125332121132SShri Abhyankar 
125432121132SShri Abhyankar   /* copy t into x according to permutation */
125532121132SShri Abhyankar   for(i=0;i<n;i++){
125632121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
125732121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];
125832121132SShri Abhyankar   }
125932121132SShri Abhyankar 
126032121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
126132121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1262b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
126332121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
126432121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
126532121132SShri Abhyankar   PetscFunctionReturn(0);
126632121132SShri Abhyankar }
126732121132SShri Abhyankar 
126832121132SShri Abhyankar #undef __FUNCT__
126906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4_inplace"
127006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
1271f1af5d2fSBarry Smith {
1272f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1273f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
12746849ba73SBarry Smith   PetscErrorCode    ierr;
12755d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1276b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1277b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1278b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1279b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1280b3260449SShri Abhyankar   const PetscScalar *b;
1281f1af5d2fSBarry Smith 
1282f1af5d2fSBarry Smith   PetscFunctionBegin;
1283b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
12841ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1285f1af5d2fSBarry Smith   t  = a->solve_work;
1286f1af5d2fSBarry Smith 
1287f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1288f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1289f1af5d2fSBarry Smith 
1290f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1291f1af5d2fSBarry Smith   ii = 0;
1292f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1293f1af5d2fSBarry Smith     ic      = 4*c[i];
1294f1af5d2fSBarry Smith     t[ii]   = b[ic];
1295f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1296f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1297f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1298f1af5d2fSBarry Smith     ii += 4;
1299f1af5d2fSBarry Smith   }
1300f1af5d2fSBarry Smith 
1301f1af5d2fSBarry Smith   /* forward solve the U^T */
1302f1af5d2fSBarry Smith   idx = 0;
1303f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1304f1af5d2fSBarry Smith 
1305f1af5d2fSBarry Smith     v     = aa + 16*diag[i];
1306f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1307f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx];
1308f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
1309f1af5d2fSBarry Smith     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
1310f1af5d2fSBarry Smith     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
1311f1af5d2fSBarry Smith     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1312f1af5d2fSBarry Smith     v += 16;
1313f1af5d2fSBarry Smith 
1314f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1315f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1316f1af5d2fSBarry Smith     while (nz--) {
1317f1af5d2fSBarry Smith       oidx = 4*(*vi++);
1318f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1319f1af5d2fSBarry Smith       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1320f1af5d2fSBarry Smith       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1321f1af5d2fSBarry Smith       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1322f1af5d2fSBarry Smith       v  += 16;
1323f1af5d2fSBarry Smith     }
1324f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4;
1325f1af5d2fSBarry Smith     idx += 4;
1326f1af5d2fSBarry Smith   }
1327f1af5d2fSBarry Smith   /* backward solve the L^T */
1328f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1329f1af5d2fSBarry Smith     v    = aa + 16*diag[i] - 16;
1330f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1331f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1332f1af5d2fSBarry Smith     idt  = 4*i;
1333f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt];
1334f1af5d2fSBarry Smith     while (nz--) {
1335f1af5d2fSBarry Smith       idx   = 4*(*vi--);
1336f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
1337f1af5d2fSBarry Smith       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
1338f1af5d2fSBarry Smith       t[idx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
1339f1af5d2fSBarry Smith       t[idx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
1340f1af5d2fSBarry Smith       v -= 16;
1341f1af5d2fSBarry Smith     }
1342f1af5d2fSBarry Smith   }
1343f1af5d2fSBarry Smith 
1344f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1345f1af5d2fSBarry Smith   ii = 0;
1346f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1347f1af5d2fSBarry Smith     ir      = 4*r[i];
1348f1af5d2fSBarry Smith     x[ir]   = t[ii];
1349f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1350f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1351f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1352f1af5d2fSBarry Smith     ii += 4;
1353f1af5d2fSBarry Smith   }
1354f1af5d2fSBarry Smith 
1355f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1356f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1357b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
13581ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1359dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
1360f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1361f1af5d2fSBarry Smith }
1362f1af5d2fSBarry Smith 
13634a2ae208SSatish Balay #undef __FUNCT__
13644dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_4"
13654dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
136632121132SShri Abhyankar {
136732121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
136832121132SShri Abhyankar   PetscErrorCode    ierr;
136932121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1370b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
137132121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
137232121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1373b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1374b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1375b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4,*x,*t;
1376b3260449SShri Abhyankar   const PetscScalar *b;
137732121132SShri Abhyankar 
137832121132SShri Abhyankar   PetscFunctionBegin;
1379b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
138032121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
138132121132SShri Abhyankar   t = a->solve_work;
138232121132SShri Abhyankar 
138332121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
138432121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
138532121132SShri Abhyankar 
138632121132SShri Abhyankar   /* copy b into temp work space according to permutation */
138732121132SShri Abhyankar   for(i=0;i<n;i++){
138832121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
138932121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
139032121132SShri Abhyankar   }
139132121132SShri Abhyankar 
139232121132SShri Abhyankar   /* forward solve the U^T */
139332121132SShri Abhyankar   idx = 0;
139432121132SShri Abhyankar   for (i=0; i<n; i++) {
139532121132SShri Abhyankar     v     = aa + bs2*diag[i];
139632121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
139732121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx];  x4 = t[3+idx];
139832121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4;
139932121132SShri Abhyankar     s2 = v[4]*x1  +  v[5]*x2 +  v[6]*x3 +  v[7]*x4;
140032121132SShri Abhyankar     s3 = v[8]*x1  +  v[9]*x2 + v[10]*x3 + v[11]*x4;
140132121132SShri Abhyankar     s4 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
140232121132SShri Abhyankar     v -= bs2;
140332121132SShri Abhyankar 
140432121132SShri Abhyankar     vi    = aj + diag[i] - 1;
140532121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
140632121132SShri Abhyankar     for(j=0;j>-nz;j--){
140732121132SShri Abhyankar       oidx = bs*vi[j];
140832121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4;
140932121132SShri Abhyankar       t[oidx+1] -= v[4]*s1  +  v[5]*s2 +  v[6]*s3 +  v[7]*s4;
141032121132SShri Abhyankar       t[oidx+2] -= v[8]*s1 + v[9]*s2 + v[10]*s3 + v[11]*s4;
141132121132SShri Abhyankar       t[oidx+3] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4;
141232121132SShri Abhyankar       v  -= bs2;
141332121132SShri Abhyankar     }
141432121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4;
141532121132SShri Abhyankar     idx += bs;
141632121132SShri Abhyankar   }
141732121132SShri Abhyankar   /* backward solve the L^T */
141832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
141932121132SShri Abhyankar     v    = aa + bs2*ai[i];
142032121132SShri Abhyankar     vi   = aj + ai[i];
142132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
142232121132SShri Abhyankar     idt  = bs*i;
142332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt];
142432121132SShri Abhyankar     for(j=0;j<nz;j++){
142532121132SShri Abhyankar       idx   = bs*vi[j];
142632121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3  +  v[3]*s4;
142732121132SShri Abhyankar       t[idx+1] -=  v[4]*s1 +  v[5]*s2 +  v[6]*s3  +  v[7]*s4;
142832121132SShri Abhyankar       t[idx+2] -=  v[8]*s1 +  v[9]*s2 +  v[10]*s3 + v[11]*s4;
142932121132SShri Abhyankar       t[idx+3] -= v[12]*s1 +  v[13]*s2 + v[14]*s3 + v[15]*s4;
143032121132SShri Abhyankar       v += bs2;
143132121132SShri Abhyankar     }
143232121132SShri Abhyankar   }
143332121132SShri Abhyankar 
143432121132SShri Abhyankar   /* copy t into x according to permutation */
143532121132SShri Abhyankar   for(i=0;i<n;i++){
143632121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
143732121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
143832121132SShri Abhyankar   }
143932121132SShri Abhyankar 
144032121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
144132121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1442b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
144332121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
144432121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
144532121132SShri Abhyankar   PetscFunctionReturn(0);
144632121132SShri Abhyankar }
144732121132SShri Abhyankar 
144832121132SShri Abhyankar #undef __FUNCT__
144906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5_inplace"
145006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
1451f1af5d2fSBarry Smith {
1452f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1453f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
14546849ba73SBarry Smith   PetscErrorCode    ierr;
14555d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1456b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1457b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1458b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1459b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1460b3260449SShri Abhyankar   const PetscScalar *b;
1461f1af5d2fSBarry Smith 
1462f1af5d2fSBarry Smith   PetscFunctionBegin;
1463b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
14641ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1465f1af5d2fSBarry Smith   t  = a->solve_work;
1466f1af5d2fSBarry Smith 
1467f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1468f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1469f1af5d2fSBarry Smith 
1470f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1471f1af5d2fSBarry Smith   ii = 0;
1472f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1473f1af5d2fSBarry Smith     ic      = 5*c[i];
1474f1af5d2fSBarry Smith     t[ii]   = b[ic];
1475f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1476f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1477f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1478f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1479f1af5d2fSBarry Smith     ii += 5;
1480f1af5d2fSBarry Smith   }
1481f1af5d2fSBarry Smith 
1482f1af5d2fSBarry Smith   /* forward solve the U^T */
1483f1af5d2fSBarry Smith   idx = 0;
1484f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1485f1af5d2fSBarry Smith 
1486f1af5d2fSBarry Smith     v     = aa + 25*diag[i];
1487f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1488f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1489f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
1490f1af5d2fSBarry Smith     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
1491f1af5d2fSBarry Smith     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1492f1af5d2fSBarry Smith     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1493f1af5d2fSBarry Smith     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1494f1af5d2fSBarry Smith     v += 25;
1495f1af5d2fSBarry Smith 
1496f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1497f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1498f1af5d2fSBarry Smith     while (nz--) {
1499f1af5d2fSBarry Smith       oidx = 5*(*vi++);
1500f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1501f1af5d2fSBarry Smith       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1502f1af5d2fSBarry Smith       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1503f1af5d2fSBarry Smith       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1504f1af5d2fSBarry Smith       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1505f1af5d2fSBarry Smith       v  += 25;
1506f1af5d2fSBarry Smith     }
1507f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1508f1af5d2fSBarry Smith     idx += 5;
1509f1af5d2fSBarry Smith   }
1510f1af5d2fSBarry Smith   /* backward solve the L^T */
1511f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1512f1af5d2fSBarry Smith     v    = aa + 25*diag[i] - 25;
1513f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1514f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1515f1af5d2fSBarry Smith     idt  = 5*i;
1516f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1517f1af5d2fSBarry Smith     while (nz--) {
1518f1af5d2fSBarry Smith       idx   = 5*(*vi--);
1519f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
1520f1af5d2fSBarry Smith       t[idx+1] -=  v[5]*s1 +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
1521f1af5d2fSBarry Smith       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
1522f1af5d2fSBarry Smith       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
1523f1af5d2fSBarry Smith       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
1524f1af5d2fSBarry Smith       v -= 25;
1525f1af5d2fSBarry Smith     }
1526f1af5d2fSBarry Smith   }
1527f1af5d2fSBarry Smith 
1528f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1529f1af5d2fSBarry Smith   ii = 0;
1530f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1531f1af5d2fSBarry Smith     ir      = 5*r[i];
1532f1af5d2fSBarry Smith     x[ir]   = t[ii];
1533f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1534f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1535f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1536f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1537f1af5d2fSBarry Smith     ii += 5;
1538f1af5d2fSBarry Smith   }
1539f1af5d2fSBarry Smith 
1540f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1541f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1542b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
15431ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1544dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
1545f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1546f1af5d2fSBarry Smith }
1547f1af5d2fSBarry Smith 
15484a2ae208SSatish Balay #undef __FUNCT__
15494dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_5"
15504dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
155132121132SShri Abhyankar {
155232121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
155332121132SShri Abhyankar   PetscErrorCode    ierr;
155432121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1555b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
155632121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
155732121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1558b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1559b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1560b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*x,*t;
1561b3260449SShri Abhyankar   const PetscScalar *b;
156232121132SShri Abhyankar 
156332121132SShri Abhyankar   PetscFunctionBegin;
1564b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
156532121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
156632121132SShri Abhyankar   t = a->solve_work;
156732121132SShri Abhyankar 
156832121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
156932121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
157032121132SShri Abhyankar 
157132121132SShri Abhyankar   /* copy b into temp work space according to permutation */
157232121132SShri Abhyankar   for(i=0;i<n;i++){
157332121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
157432121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
157532121132SShri Abhyankar     t[ii+4] = b[ic+4];
157632121132SShri Abhyankar   }
157732121132SShri Abhyankar 
157832121132SShri Abhyankar   /* forward solve the U^T */
157932121132SShri Abhyankar   idx = 0;
158032121132SShri Abhyankar   for (i=0; i<n; i++) {
158132121132SShri Abhyankar     v     = aa + bs2*diag[i];
158232121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
158332121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
158432121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5;
158532121132SShri Abhyankar     s2 = v[5]*x1  +  v[6]*x2 +  v[7]*x3 +  v[8]*x4 +  v[9]*x5;
158632121132SShri Abhyankar     s3 = v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
158732121132SShri Abhyankar     s4 = v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
158832121132SShri Abhyankar     s5 = v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
158932121132SShri Abhyankar     v -= bs2;
159032121132SShri Abhyankar 
159132121132SShri Abhyankar     vi    = aj + diag[i] - 1;
159232121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
159332121132SShri Abhyankar     for(j=0;j>-nz;j--){
159432121132SShri Abhyankar       oidx = bs*vi[j];
159532121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
159632121132SShri Abhyankar       t[oidx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
159732121132SShri Abhyankar       t[oidx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
159832121132SShri Abhyankar       t[oidx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
159932121132SShri Abhyankar       t[oidx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
160032121132SShri Abhyankar       v  -= bs2;
160132121132SShri Abhyankar     }
160232121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
160332121132SShri Abhyankar     idx += bs;
160432121132SShri Abhyankar   }
160532121132SShri Abhyankar   /* backward solve the L^T */
160632121132SShri Abhyankar   for (i=n-1; i>=0; i--){
160732121132SShri Abhyankar     v    = aa + bs2*ai[i];
160832121132SShri Abhyankar     vi   = aj + ai[i];
160932121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
161032121132SShri Abhyankar     idt  = bs*i;
161132121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
161232121132SShri Abhyankar     for(j=0;j<nz;j++){
161332121132SShri Abhyankar       idx   = bs*vi[j];
161432121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5;
161532121132SShri Abhyankar       t[idx+1] -= v[5]*s1  +  v[6]*s2 +  v[7]*s3 +  v[8]*s4 +  v[9]*s5;
161632121132SShri Abhyankar       t[idx+2] -= v[10]*s1 + v[11]*s2 + v[12]*s3 + v[13]*s4 + v[14]*s5;
161732121132SShri Abhyankar       t[idx+3] -= v[15]*s1 + v[16]*s2 + v[17]*s3 + v[18]*s4 + v[19]*s5;
161832121132SShri Abhyankar       t[idx+4] -= v[20]*s1 + v[21]*s2 + v[22]*s3 + v[23]*s4 + v[24]*s5;
161932121132SShri Abhyankar       v += bs2;
162032121132SShri Abhyankar     }
162132121132SShri Abhyankar   }
162232121132SShri Abhyankar 
162332121132SShri Abhyankar   /* copy t into x according to permutation */
162432121132SShri Abhyankar   for(i=0;i<n;i++){
162532121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
162632121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
162732121132SShri Abhyankar     x[ir+4] = t[ii+4];
162832121132SShri Abhyankar   }
162932121132SShri Abhyankar 
163032121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
163132121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1632b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
163332121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
163432121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
163532121132SShri Abhyankar   PetscFunctionReturn(0);
163632121132SShri Abhyankar }
163732121132SShri Abhyankar 
163832121132SShri Abhyankar #undef __FUNCT__
163906e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6_inplace"
164006e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
1641f1af5d2fSBarry Smith {
1642f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1643f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
16446849ba73SBarry Smith   PetscErrorCode    ierr;
16455d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1646b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1647b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1648b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1649b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1650b3260449SShri Abhyankar   const PetscScalar *b;
1651f1af5d2fSBarry Smith 
1652f1af5d2fSBarry Smith   PetscFunctionBegin;
1653b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
16541ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1655f1af5d2fSBarry Smith   t  = a->solve_work;
1656f1af5d2fSBarry Smith 
1657f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1658f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1659f1af5d2fSBarry Smith 
1660f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1661f1af5d2fSBarry Smith   ii = 0;
1662f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1663f1af5d2fSBarry Smith     ic      = 6*c[i];
1664f1af5d2fSBarry Smith     t[ii]   = b[ic];
1665f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1666f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1667f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1668f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1669f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1670f1af5d2fSBarry Smith     ii += 6;
1671f1af5d2fSBarry Smith   }
1672f1af5d2fSBarry Smith 
1673f1af5d2fSBarry Smith   /* forward solve the U^T */
1674f1af5d2fSBarry Smith   idx = 0;
1675f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1676f1af5d2fSBarry Smith 
1677f1af5d2fSBarry Smith     v     = aa + 36*diag[i];
1678f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1679f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1680f1af5d2fSBarry Smith     x6    = t[5+idx];
1681f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
1682f1af5d2fSBarry Smith     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
1683f1af5d2fSBarry Smith     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
1684f1af5d2fSBarry Smith     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
1685f1af5d2fSBarry Smith     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
1686f1af5d2fSBarry Smith     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
1687f1af5d2fSBarry Smith     v += 36;
1688f1af5d2fSBarry Smith 
1689f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1690f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1691f1af5d2fSBarry Smith     while (nz--) {
1692f1af5d2fSBarry Smith       oidx = 6*(*vi++);
1693f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1694f1af5d2fSBarry Smith       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1695f1af5d2fSBarry Smith       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1696f1af5d2fSBarry Smith       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1697f1af5d2fSBarry Smith       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1698f1af5d2fSBarry Smith       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1699f1af5d2fSBarry Smith       v  += 36;
1700f1af5d2fSBarry Smith     }
1701f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1702f1af5d2fSBarry Smith     t[5+idx] = s6;
1703f1af5d2fSBarry Smith     idx += 6;
1704f1af5d2fSBarry Smith   }
1705f1af5d2fSBarry Smith   /* backward solve the L^T */
1706f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1707f1af5d2fSBarry Smith     v    = aa + 36*diag[i] - 36;
1708f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1709f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1710f1af5d2fSBarry Smith     idt  = 6*i;
1711f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1712f1af5d2fSBarry Smith     s6 = t[5+idt];
1713f1af5d2fSBarry Smith     while (nz--) {
1714f1af5d2fSBarry Smith       idx   = 6*(*vi--);
1715f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
1716f1af5d2fSBarry Smith       t[idx+1] -=  v[6]*s1 +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
1717f1af5d2fSBarry Smith       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
1718f1af5d2fSBarry Smith       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
1719f1af5d2fSBarry Smith       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
1720f1af5d2fSBarry Smith       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
1721f1af5d2fSBarry Smith       v -= 36;
1722f1af5d2fSBarry Smith     }
1723f1af5d2fSBarry Smith   }
1724f1af5d2fSBarry Smith 
1725f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1726f1af5d2fSBarry Smith   ii = 0;
1727f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1728f1af5d2fSBarry Smith     ir      = 6*r[i];
1729f1af5d2fSBarry Smith     x[ir]   = t[ii];
1730f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1731f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1732f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1733f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1734f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1735f1af5d2fSBarry Smith     ii += 6;
1736f1af5d2fSBarry Smith   }
1737f1af5d2fSBarry Smith 
1738f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1739f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1740b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
17411ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1742dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
1743f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1744f1af5d2fSBarry Smith }
1745f1af5d2fSBarry Smith 
17464a2ae208SSatish Balay #undef __FUNCT__
17474dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_6"
17484dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
174932121132SShri Abhyankar {
175032121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
175132121132SShri Abhyankar   PetscErrorCode    ierr;
175232121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1753b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
175432121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
175532121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1756b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1757b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1758b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*x,*t;
1759b3260449SShri Abhyankar   const PetscScalar *b;
176032121132SShri Abhyankar 
176132121132SShri Abhyankar   PetscFunctionBegin;
1762b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
176332121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
176432121132SShri Abhyankar   t = a->solve_work;
176532121132SShri Abhyankar 
176632121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
176732121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
176832121132SShri Abhyankar 
176932121132SShri Abhyankar   /* copy b into temp work space according to permutation */
177032121132SShri Abhyankar   for(i=0;i<n;i++){
177132121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
177232121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
177332121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];
177432121132SShri Abhyankar   }
177532121132SShri Abhyankar 
177632121132SShri Abhyankar   /* forward solve the U^T */
177732121132SShri Abhyankar   idx = 0;
177832121132SShri Abhyankar   for (i=0; i<n; i++) {
177932121132SShri Abhyankar     v     = aa + bs2*diag[i];
178032121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
178132121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
178232121132SShri Abhyankar     x6    = t[5+idx];
178332121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6;
178432121132SShri Abhyankar     s2 = v[6]*x1  +  v[7]*x2 +  v[8]*x3 +  v[9]*x4 + v[10]*x5 + v[11]*x6;
178532121132SShri Abhyankar     s3 = v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4 + v[16]*x5 + v[17]*x6;
178632121132SShri Abhyankar     s4 = v[18]*x1 + v[19]*x2 + v[20]*x3 + v[21]*x4 + v[22]*x5 + v[23]*x6;
178732121132SShri Abhyankar     s5 = v[24]*x1 + v[25]*x2 + v[26]*x3 + v[27]*x4 + v[28]*x5 + v[29]*x6;
178832121132SShri Abhyankar     s6 = v[30]*x1 + v[31]*x2 + v[32]*x3 + v[33]*x4 + v[34]*x5 + v[35]*x6;
178932121132SShri Abhyankar     v -= bs2;
179032121132SShri Abhyankar 
179132121132SShri Abhyankar     vi    = aj + diag[i] - 1;
179232121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
179332121132SShri Abhyankar     for(j=0;j>-nz;j--){
179432121132SShri Abhyankar       oidx = bs*vi[j];
179532121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
179632121132SShri Abhyankar       t[oidx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
179732121132SShri Abhyankar       t[oidx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
179832121132SShri Abhyankar       t[oidx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
179932121132SShri Abhyankar       t[oidx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
180032121132SShri Abhyankar       t[oidx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
180132121132SShri Abhyankar       v  -= bs2;
180232121132SShri Abhyankar     }
180332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
180432121132SShri Abhyankar     t[5+idx] = s6;
180532121132SShri Abhyankar     idx += bs;
180632121132SShri Abhyankar   }
180732121132SShri Abhyankar   /* backward solve the L^T */
180832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
180932121132SShri Abhyankar     v    = aa + bs2*ai[i];
181032121132SShri Abhyankar     vi   = aj + ai[i];
181132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
181232121132SShri Abhyankar     idt  = bs*i;
181332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
181432121132SShri Abhyankar     s6   = t[5+idt];
181532121132SShri Abhyankar    for(j=0;j<nz;j++){
181632121132SShri Abhyankar       idx   = bs*vi[j];
181732121132SShri Abhyankar       t[idx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6;
181832121132SShri Abhyankar       t[idx+1] -= v[6]*s1  +  v[7]*s2 +  v[8]*s3 +  v[9]*s4 + v[10]*s5 + v[11]*s6;
181932121132SShri Abhyankar       t[idx+2] -= v[12]*s1 + v[13]*s2 + v[14]*s3 + v[15]*s4 + v[16]*s5 + v[17]*s6;
182032121132SShri Abhyankar       t[idx+3] -= v[18]*s1 + v[19]*s2 + v[20]*s3 + v[21]*s4 + v[22]*s5 + v[23]*s6;
182132121132SShri Abhyankar       t[idx+4] -= v[24]*s1 + v[25]*s2 + v[26]*s3 + v[27]*s4 + v[28]*s5 + v[29]*s6;
182232121132SShri Abhyankar       t[idx+5] -= v[30]*s1 + v[31]*s2 + v[32]*s3 + v[33]*s4 + v[34]*s5 + v[35]*s6;
182332121132SShri Abhyankar       v += bs2;
182432121132SShri Abhyankar     }
182532121132SShri Abhyankar   }
182632121132SShri Abhyankar 
182732121132SShri Abhyankar   /* copy t into x according to permutation */
182832121132SShri Abhyankar   for(i=0;i<n;i++){
182932121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
183032121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
183132121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];
183232121132SShri Abhyankar   }
183332121132SShri Abhyankar 
183432121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
183532121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1836b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
183732121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
183832121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
183932121132SShri Abhyankar   PetscFunctionReturn(0);
184032121132SShri Abhyankar }
184132121132SShri Abhyankar 
184232121132SShri Abhyankar #undef __FUNCT__
184306e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7_inplace"
184406e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
1845f1af5d2fSBarry Smith {
1846f1af5d2fSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
1847f1af5d2fSBarry Smith   IS                iscol=a->col,isrow=a->row;
18486849ba73SBarry Smith   PetscErrorCode    ierr;
18495d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
1850b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
1851b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,ii,ic,ir,oidx;
1852b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1853b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1854b3260449SShri Abhyankar   const PetscScalar *b;
1855f1af5d2fSBarry Smith 
1856f1af5d2fSBarry Smith   PetscFunctionBegin;
1857b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
18581ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
1859f1af5d2fSBarry Smith   t  = a->solve_work;
1860f1af5d2fSBarry Smith 
1861f1af5d2fSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
1862f1af5d2fSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
1863f1af5d2fSBarry Smith 
1864f1af5d2fSBarry Smith   /* copy the b into temp work space according to permutation */
1865f1af5d2fSBarry Smith   ii = 0;
1866f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1867f1af5d2fSBarry Smith     ic      = 7*c[i];
1868f1af5d2fSBarry Smith     t[ii]   = b[ic];
1869f1af5d2fSBarry Smith     t[ii+1] = b[ic+1];
1870f1af5d2fSBarry Smith     t[ii+2] = b[ic+2];
1871f1af5d2fSBarry Smith     t[ii+3] = b[ic+3];
1872f1af5d2fSBarry Smith     t[ii+4] = b[ic+4];
1873f1af5d2fSBarry Smith     t[ii+5] = b[ic+5];
1874f1af5d2fSBarry Smith     t[ii+6] = b[ic+6];
1875f1af5d2fSBarry Smith     ii += 7;
1876f1af5d2fSBarry Smith   }
1877f1af5d2fSBarry Smith 
1878f1af5d2fSBarry Smith   /* forward solve the U^T */
1879f1af5d2fSBarry Smith   idx = 0;
1880f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1881f1af5d2fSBarry Smith 
1882f1af5d2fSBarry Smith     v     = aa + 49*diag[i];
1883f1af5d2fSBarry Smith     /* multiply by the inverse of the block diagonal */
1884f1af5d2fSBarry Smith     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
1885f1af5d2fSBarry Smith     x6    = t[5+idx]; x7 = t[6+idx];
1886f1af5d2fSBarry Smith     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
1887f1af5d2fSBarry Smith     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
1888f1af5d2fSBarry Smith     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
1889f1af5d2fSBarry Smith     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
1890f1af5d2fSBarry Smith     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
1891f1af5d2fSBarry Smith     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
1892f1af5d2fSBarry Smith     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
1893f1af5d2fSBarry Smith     v += 49;
1894f1af5d2fSBarry Smith 
1895f1af5d2fSBarry Smith     vi    = aj + diag[i] + 1;
1896f1af5d2fSBarry Smith     nz    = ai[i+1] - diag[i] - 1;
1897f1af5d2fSBarry Smith     while (nz--) {
1898f1af5d2fSBarry Smith       oidx = 7*(*vi++);
1899f1af5d2fSBarry Smith       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1900f1af5d2fSBarry Smith       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1901f1af5d2fSBarry Smith       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1902f1af5d2fSBarry Smith       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1903f1af5d2fSBarry Smith       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1904f1af5d2fSBarry Smith       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1905f1af5d2fSBarry Smith       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1906f1af5d2fSBarry Smith       v  += 49;
1907f1af5d2fSBarry Smith     }
1908f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2; t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
1909f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
1910f1af5d2fSBarry Smith     idx += 7;
1911f1af5d2fSBarry Smith   }
1912f1af5d2fSBarry Smith   /* backward solve the L^T */
1913f1af5d2fSBarry Smith   for (i=n-1; i>=0; i--){
1914f1af5d2fSBarry Smith     v    = aa + 49*diag[i] - 49;
1915f1af5d2fSBarry Smith     vi   = aj + diag[i] - 1;
1916f1af5d2fSBarry Smith     nz   = diag[i] - ai[i];
1917f1af5d2fSBarry Smith     idt  = 7*i;
1918f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt]; s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
1919f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
1920f1af5d2fSBarry Smith     while (nz--) {
1921f1af5d2fSBarry Smith       idx   = 7*(*vi--);
1922f1af5d2fSBarry Smith       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
1923f1af5d2fSBarry Smith       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
1924f1af5d2fSBarry Smith       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
1925f1af5d2fSBarry Smith       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
1926f1af5d2fSBarry Smith       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
1927f1af5d2fSBarry Smith       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
1928f1af5d2fSBarry Smith       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
1929f1af5d2fSBarry Smith       v -= 49;
1930f1af5d2fSBarry Smith     }
1931f1af5d2fSBarry Smith   }
1932f1af5d2fSBarry Smith 
1933f1af5d2fSBarry Smith   /* copy t into x according to permutation */
1934f1af5d2fSBarry Smith   ii = 0;
1935f1af5d2fSBarry Smith   for (i=0; i<n; i++) {
1936f1af5d2fSBarry Smith     ir      = 7*r[i];
1937f1af5d2fSBarry Smith     x[ir]   = t[ii];
1938f1af5d2fSBarry Smith     x[ir+1] = t[ii+1];
1939f1af5d2fSBarry Smith     x[ir+2] = t[ii+2];
1940f1af5d2fSBarry Smith     x[ir+3] = t[ii+3];
1941f1af5d2fSBarry Smith     x[ir+4] = t[ii+4];
1942f1af5d2fSBarry Smith     x[ir+5] = t[ii+5];
1943f1af5d2fSBarry Smith     x[ir+6] = t[ii+6];
1944f1af5d2fSBarry Smith     ii += 7;
1945f1af5d2fSBarry Smith   }
1946f1af5d2fSBarry Smith 
1947f1af5d2fSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
1948f1af5d2fSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
1949b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
19501ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
1951dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
1952f1af5d2fSBarry Smith   PetscFunctionReturn(0);
1953f1af5d2fSBarry Smith }
195432121132SShri Abhyankar #undef __FUNCT__
19554dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_7"
19564dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
195732121132SShri Abhyankar {
195832121132SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
195932121132SShri Abhyankar   PetscErrorCode    ierr;
196032121132SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
1961b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
196232121132SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
196332121132SShri Abhyankar   PetscInt          nz,idx,idt,j,i,oidx,ii,ic,ir;
1964b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
1965b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
1966b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
1967b3260449SShri Abhyankar   const PetscScalar *b;
196832121132SShri Abhyankar 
196932121132SShri Abhyankar   PetscFunctionBegin;
1970b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
197132121132SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
197232121132SShri Abhyankar   t = a->solve_work;
197332121132SShri Abhyankar 
197432121132SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
197532121132SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
197632121132SShri Abhyankar 
197732121132SShri Abhyankar   /* copy b into temp work space according to permutation */
197832121132SShri Abhyankar   for(i=0;i<n;i++){
197932121132SShri Abhyankar     ii = bs*i; ic = bs*c[i];
198032121132SShri Abhyankar     t[ii] = b[ic]; t[ii+1] = b[ic+1]; t[ii+2] = b[ic+2]; t[ii+3] = b[ic+3];
198132121132SShri Abhyankar     t[ii+4] = b[ic+4];  t[ii+5] = b[ic+5];  t[ii+6] = b[ic+6];
198232121132SShri Abhyankar   }
198332121132SShri Abhyankar 
198432121132SShri Abhyankar   /* forward solve the U^T */
198532121132SShri Abhyankar   idx = 0;
198632121132SShri Abhyankar   for (i=0; i<n; i++) {
198732121132SShri Abhyankar     v     = aa + bs2*diag[i];
198832121132SShri Abhyankar     /* multiply by the inverse of the block diagonal */
198932121132SShri Abhyankar     x1    = t[idx];   x2 = t[1+idx]; x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
199032121132SShri Abhyankar     x6    = t[5+idx]; x7 = t[6+idx];
199132121132SShri Abhyankar     s1 = v[0]*x1  +  v[1]*x2 +  v[2]*x3 +  v[3]*x4 +  v[4]*x5 +  v[5]*x6 +  v[6]*x7;
199232121132SShri Abhyankar     s2 = v[7]*x1  +  v[8]*x2 +  v[9]*x3 + v[10]*x4 + v[11]*x5 + v[12]*x6 + v[13]*x7;
199332121132SShri Abhyankar     s3 = v[14]*x1 + v[15]*x2 + v[16]*x3 + v[17]*x4 + v[18]*x5 + v[19]*x6 + v[20]*x7;
199432121132SShri Abhyankar     s4 = v[21]*x1 + v[22]*x2 + v[23]*x3 + v[24]*x4 + v[25]*x5 + v[26]*x6 + v[27]*x7;
199532121132SShri Abhyankar     s5 = v[28]*x1 + v[29]*x2 + v[30]*x3 + v[31]*x4 + v[32]*x5 + v[33]*x6 + v[34]*x7;
199632121132SShri Abhyankar     s6 = v[35]*x1 + v[36]*x2 + v[37]*x3 + v[38]*x4 + v[39]*x5 + v[40]*x6 + v[41]*x7;
199732121132SShri Abhyankar     s7 = v[42]*x1 + v[43]*x2 + v[44]*x3 + v[45]*x4 + v[46]*x5 + v[47]*x6 + v[48]*x7;
199832121132SShri Abhyankar     v -= bs2;
199932121132SShri Abhyankar 
200032121132SShri Abhyankar     vi    = aj + diag[i] - 1;
200132121132SShri Abhyankar     nz    = diag[i] - diag[i+1] - 1;
200232121132SShri Abhyankar     for(j=0;j>-nz;j--){
200332121132SShri Abhyankar       oidx = bs*vi[j];
200432121132SShri Abhyankar       t[oidx]   -= v[0]*s1  +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
200532121132SShri Abhyankar       t[oidx+1] -= v[7]*s1  +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
200632121132SShri Abhyankar       t[oidx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
200732121132SShri Abhyankar       t[oidx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
200832121132SShri Abhyankar       t[oidx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
200932121132SShri Abhyankar       t[oidx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
201032121132SShri Abhyankar       t[oidx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
201132121132SShri Abhyankar       v  -= bs2;
201232121132SShri Abhyankar     }
201332121132SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;  t[2+idx] = s3;  t[3+idx] = s4; t[4+idx] =s5;
201432121132SShri Abhyankar     t[5+idx] = s6;  t[6+idx] = s7;
201532121132SShri Abhyankar     idx += bs;
201632121132SShri Abhyankar   }
201732121132SShri Abhyankar   /* backward solve the L^T */
201832121132SShri Abhyankar   for (i=n-1; i>=0; i--){
201932121132SShri Abhyankar     v    = aa + bs2*ai[i];
202032121132SShri Abhyankar     vi   = aj + ai[i];
202132121132SShri Abhyankar     nz   = ai[i+1] - ai[i];
202232121132SShri Abhyankar     idt  = bs*i;
202332121132SShri Abhyankar     s1   = t[idt];  s2 = t[1+idt];  s3 = t[2+idt];  s4 = t[3+idt]; s5 = t[4+idt];
202432121132SShri Abhyankar     s6   = t[5+idt];  s7 = t[6+idt];
202532121132SShri Abhyankar    for(j=0;j<nz;j++){
202632121132SShri Abhyankar       idx   = bs*vi[j];
202732121132SShri Abhyankar       t[idx]   -=  v[0]*s1 +  v[1]*s2 +  v[2]*s3 +  v[3]*s4 +  v[4]*s5 +  v[5]*s6 +  v[6]*s7;
202832121132SShri Abhyankar       t[idx+1] -=  v[7]*s1 +  v[8]*s2 +  v[9]*s3 + v[10]*s4 + v[11]*s5 + v[12]*s6 + v[13]*s7;
202932121132SShri Abhyankar       t[idx+2] -= v[14]*s1 + v[15]*s2 + v[16]*s3 + v[17]*s4 + v[18]*s5 + v[19]*s6 + v[20]*s7;
203032121132SShri Abhyankar       t[idx+3] -= v[21]*s1 + v[22]*s2 + v[23]*s3 + v[24]*s4 + v[25]*s5 + v[26]*s6 + v[27]*s7;
203132121132SShri Abhyankar       t[idx+4] -= v[28]*s1 + v[29]*s2 + v[30]*s3 + v[31]*s4 + v[32]*s5 + v[33]*s6 + v[34]*s7;
203232121132SShri Abhyankar       t[idx+5] -= v[35]*s1 + v[36]*s2 + v[37]*s3 + v[38]*s4 + v[39]*s5 + v[40]*s6 + v[41]*s7;
203332121132SShri Abhyankar       t[idx+6] -= v[42]*s1 + v[43]*s2 + v[44]*s3 + v[45]*s4 + v[46]*s5 + v[47]*s6 + v[48]*s7;
203432121132SShri Abhyankar       v += bs2;
203532121132SShri Abhyankar     }
203632121132SShri Abhyankar   }
203732121132SShri Abhyankar 
203832121132SShri Abhyankar   /* copy t into x according to permutation */
203932121132SShri Abhyankar   for(i=0;i<n;i++){
204032121132SShri Abhyankar     ii = bs*i;  ir = bs*r[i];
204132121132SShri Abhyankar     x[ir] = t[ii];  x[ir+1] = t[ii+1]; x[ir+2] = t[ii+2];  x[ir+3] = t[ii+3];
204232121132SShri Abhyankar     x[ir+4] = t[ii+4];  x[ir+5] = t[ii+5];  x[ir+6] = t[ii+6];
204332121132SShri Abhyankar   }
204432121132SShri Abhyankar 
204532121132SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
204632121132SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2047b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
204832121132SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
204932121132SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
205032121132SShri Abhyankar   PetscFunctionReturn(0);
205132121132SShri Abhyankar }
2052f1af5d2fSBarry Smith 
20534e2b4712SSatish Balay /* ----------------------------------------------------------- */
20544a2ae208SSatish Balay #undef __FUNCT__
205506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_N_inplace"
205606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
20574e2b4712SSatish Balay {
20584e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
20594e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
20606849ba73SBarry Smith   PetscErrorCode    ierr;
2061b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2062b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi;
2063b3260449SShri Abhyankar   PetscInt          i,nz;
2064b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
2065b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2066b3260449SShri Abhyankar   PetscScalar       *x,*s,*t,*ls;
2067b3260449SShri Abhyankar   const PetscScalar *b;
20684e2b4712SSatish Balay 
20694e2b4712SSatish Balay   PetscFunctionBegin;
2070b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
20711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2072f1af5d2fSBarry Smith   t  = a->solve_work;
20734e2b4712SSatish Balay 
20744e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
20754e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
20764e2b4712SSatish Balay 
20774e2b4712SSatish Balay   /* forward solve the lower triangular */
207887828ca2SBarry Smith   ierr = PetscMemcpy(t,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
20794e2b4712SSatish Balay   for (i=1; i<n; i++) {
20804e2b4712SSatish Balay     v   = aa + bs2*ai[i];
20814e2b4712SSatish Balay     vi  = aj + ai[i];
20824e2b4712SSatish Balay     nz  = a->diag[i] - ai[i];
2083f1af5d2fSBarry Smith     s = t + bs*i;
208487828ca2SBarry Smith     ierr = PetscMemcpy(s,b+bs*(*r++),bs*sizeof(PetscScalar));CHKERRQ(ierr);
20854e2b4712SSatish Balay     while (nz--) {
2086f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,s,v,t+bs*(*vi++));
20874e2b4712SSatish Balay       v += bs2;
20884e2b4712SSatish Balay     }
20894e2b4712SSatish Balay   }
20904e2b4712SSatish Balay   /* backward solve the upper triangular */
2091d0f46423SBarry Smith   ls = a->solve_work + A->cmap->n;
20924e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
20934e2b4712SSatish Balay     v   = aa + bs2*(a->diag[i] + 1);
20944e2b4712SSatish Balay     vi  = aj + a->diag[i] + 1;
20954e2b4712SSatish Balay     nz  = ai[i+1] - a->diag[i] - 1;
209687828ca2SBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
20974e2b4712SSatish Balay     while (nz--) {
2098f1af5d2fSBarry Smith       Kernel_v_gets_v_minus_A_times_w(bs,ls,v,t+bs*(*vi++));
20994e2b4712SSatish Balay       v += bs2;
21004e2b4712SSatish Balay     }
2101f1af5d2fSBarry Smith     Kernel_w_gets_A_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
210287828ca2SBarry Smith     ierr = PetscMemcpy(x + bs*(*c--),t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21034e2b4712SSatish Balay   }
21044e2b4712SSatish Balay 
21054e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21064e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2107b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2109dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21104e2b4712SSatish Balay   PetscFunctionReturn(0);
21114e2b4712SSatish Balay }
21124e2b4712SSatish Balay 
21135c42ef9dSBarry Smith /* ----------------------------------------------------------- */
21145c42ef9dSBarry Smith #undef __FUNCT__
211506e38f1dSHong Zhang #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N_inplace"
211606e38f1dSHong Zhang PetscErrorCode MatSolveTranspose_SeqBAIJ_N_inplace(Mat A,Vec bb,Vec xx)
21175c42ef9dSBarry Smith {
21185c42ef9dSBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21195c42ef9dSBarry Smith   IS                iscol=a->col,isrow=a->row;
21205c42ef9dSBarry Smith   PetscErrorCode    ierr;
21215c42ef9dSBarry Smith   const PetscInt    *r,*c,*rout,*cout,*ai=a->i,*aj=a->j,*vi;
2122b3260449SShri Abhyankar   PetscInt          i,nz,j;
2123b3260449SShri Abhyankar   const PetscInt    n=a->mbs,bs=A->rmap->bs,bs2=a->bs2;
21245c42ef9dSBarry Smith   const MatScalar   *aa=a->a,*v;
21255c42ef9dSBarry Smith   PetscScalar       *x,*t,*ls;
21265c42ef9dSBarry Smith   const PetscScalar *b;
21275c42ef9dSBarry Smith   PetscFunctionBegin;
21285c42ef9dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21295c42ef9dSBarry Smith   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
21305c42ef9dSBarry Smith   t    = a->solve_work;
21315c42ef9dSBarry Smith 
21325c42ef9dSBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
21335c42ef9dSBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
21345c42ef9dSBarry Smith 
21355c42ef9dSBarry Smith   /* copy the b into temp work space according to permutation */
21365c42ef9dSBarry Smith   for (i=0; i<n; i++) {
21375c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
21385c42ef9dSBarry Smith       t[i*bs+j] = b[c[i]*bs+j];
21395c42ef9dSBarry Smith     }
21405c42ef9dSBarry Smith   }
21415c42ef9dSBarry Smith 
21425c42ef9dSBarry Smith 
21435c42ef9dSBarry Smith   /* forward solve the upper triangular transpose */
21445c42ef9dSBarry Smith   ls = a->solve_work + A->cmap->n;
21455c42ef9dSBarry Smith   for (i=0; i<n; i++){
21465c42ef9dSBarry Smith     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
21475c42ef9dSBarry Smith     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*a->diag[i],t+i*bs);
21485c42ef9dSBarry Smith     v   = aa + bs2*(a->diag[i] + 1);
21495c42ef9dSBarry Smith     vi  = aj + a->diag[i] + 1;
21505c42ef9dSBarry Smith     nz  = ai[i+1] - a->diag[i] - 1;
21515c42ef9dSBarry Smith     while (nz--) {
21525c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
21535c42ef9dSBarry Smith       v += bs2;
21545c42ef9dSBarry Smith     }
21555c42ef9dSBarry Smith   }
21565c42ef9dSBarry Smith 
21575c42ef9dSBarry Smith   /* backward solve the lower triangular transpose */
21585c42ef9dSBarry Smith   for (i=n-1; i>=0; i--) {
21595c42ef9dSBarry Smith     v   = aa + bs2*ai[i];
21605c42ef9dSBarry Smith     vi  = aj + ai[i];
21615c42ef9dSBarry Smith     nz  = a->diag[i] - ai[i];
21625c42ef9dSBarry Smith     while (nz--) {
21635c42ef9dSBarry Smith       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(*vi++),v,t+i*bs);
21645c42ef9dSBarry Smith       v += bs2;
21655c42ef9dSBarry Smith     }
21665c42ef9dSBarry Smith   }
21675c42ef9dSBarry Smith 
21685c42ef9dSBarry Smith   /* copy t into x according to permutation */
21695c42ef9dSBarry Smith   for (i=0; i<n; i++) {
21705c42ef9dSBarry Smith     for (j=0; j<bs; j++) {
21715c42ef9dSBarry Smith       x[bs*r[i]+j]   = t[bs*i+j];
21725c42ef9dSBarry Smith     }
21735c42ef9dSBarry Smith   }
21745c42ef9dSBarry Smith 
21755c42ef9dSBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
21765c42ef9dSBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
21775c42ef9dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
21785c42ef9dSBarry Smith   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
21795c42ef9dSBarry Smith   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
21805c42ef9dSBarry Smith   PetscFunctionReturn(0);
21815c42ef9dSBarry Smith }
21825c42ef9dSBarry Smith 
21834a2ae208SSatish Balay #undef __FUNCT__
21844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolveTranspose_SeqBAIJ_N"
21854dd39f65SShri Abhyankar PetscErrorCode MatSolveTranspose_SeqBAIJ_N(Mat A,Vec bb,Vec xx)
21868499736aSShri Abhyankar {
21878499736aSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
21888499736aSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
21898499736aSShri Abhyankar   PetscErrorCode    ierr;
2190b3260449SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2191b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*vi,*diag=a->diag;
2192b3260449SShri Abhyankar   PetscInt          i,j,nz;
2193b3260449SShri Abhyankar   const PetscInt    bs=A->rmap->bs,bs2=a->bs2;
21948499736aSShri Abhyankar   const MatScalar   *aa=a->a,*v;
21958499736aSShri Abhyankar   PetscScalar       *x,*t,*ls;
21968499736aSShri Abhyankar   const PetscScalar *b;
2197b3260449SShri Abhyankar 
21988499736aSShri Abhyankar   PetscFunctionBegin;
21998499736aSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22008499736aSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22018499736aSShri Abhyankar   t    = a->solve_work;
22028499736aSShri Abhyankar 
22038499736aSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
22048499736aSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
22058499736aSShri Abhyankar 
22068499736aSShri Abhyankar   /* copy the b into temp work space according to permutation */
22078499736aSShri Abhyankar   for (i=0; i<n; i++) {
22088499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22098499736aSShri Abhyankar       t[i*bs+j] = b[c[i]*bs+j];
22108499736aSShri Abhyankar     }
22118499736aSShri Abhyankar   }
22128499736aSShri Abhyankar 
22138499736aSShri Abhyankar 
22148499736aSShri Abhyankar   /* forward solve the upper triangular transpose */
22158499736aSShri Abhyankar   ls = a->solve_work + A->cmap->n;
22168499736aSShri Abhyankar   for (i=0; i<n; i++){
22178499736aSShri Abhyankar     ierr = PetscMemcpy(ls,t+i*bs,bs*sizeof(PetscScalar));CHKERRQ(ierr);
22188499736aSShri Abhyankar     Kernel_w_gets_transA_times_v(bs,ls,aa+bs2*diag[i],t+i*bs);
22198499736aSShri Abhyankar     v   = aa + bs2*(diag[i] - 1);
22208499736aSShri Abhyankar     vi  = aj + diag[i] - 1;
22218499736aSShri Abhyankar     nz  = diag[i] - diag[i+1] - 1;
22228499736aSShri Abhyankar     for(j=0;j>-nz;j--){
22238499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22248499736aSShri Abhyankar       v -= bs2;
22258499736aSShri Abhyankar     }
22268499736aSShri Abhyankar   }
22278499736aSShri Abhyankar 
22288499736aSShri Abhyankar   /* backward solve the lower triangular transpose */
22298499736aSShri Abhyankar   for (i=n-1; i>=0; i--) {
22308499736aSShri Abhyankar     v   = aa + bs2*ai[i];
22318499736aSShri Abhyankar     vi  = aj + ai[i];
22328499736aSShri Abhyankar     nz  = ai[i+1] - ai[i];
22338499736aSShri Abhyankar     for(j=0;j<nz;j++){
22348499736aSShri Abhyankar       Kernel_v_gets_v_minus_transA_times_w(bs,t+bs*(vi[j]),v,t+i*bs);
22358499736aSShri Abhyankar       v += bs2;
22368499736aSShri Abhyankar     }
22378499736aSShri Abhyankar   }
22388499736aSShri Abhyankar 
22398499736aSShri Abhyankar   /* copy t into x according to permutation */
22408499736aSShri Abhyankar   for (i=0; i<n; i++) {
22418499736aSShri Abhyankar     for (j=0; j<bs; j++) {
22428499736aSShri Abhyankar       x[bs*r[i]+j]   = t[bs*i+j];
22438499736aSShri Abhyankar     }
22448499736aSShri Abhyankar   }
22458499736aSShri Abhyankar 
22468499736aSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
22478499736aSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
22488499736aSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22498499736aSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
22508499736aSShri Abhyankar   ierr = PetscLogFlops(2.0*(a->bs2)*(a->nz) - A->rmap->bs*A->cmap->n);CHKERRQ(ierr);
22518499736aSShri Abhyankar   PetscFunctionReturn(0);
22528499736aSShri Abhyankar }
22538499736aSShri Abhyankar 
2254832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing all the columns   of the block at once */
225529a97285SShri Abhyankar 
22562b0b2ea7SShri Abhyankar #undef __FUNCT__
2257832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver2"
/*
   MatSolve_SeqBAIJ_15_NaturalOrdering_ver2 - Forward/backward block triangular
   solve with fully unrolled 15x15 block products and no row/column permutation.

   Input Parameters:
+  A  - the matrix whose a->i, a->j, a->a and a->diag arrays are swept
-  bb - right-hand side vector

   Output Parameter:
.  xx - solution vector

   Notes:
   Each block is read as v[r + 15*c] (row r, column c), i.e. column by column.
   The literal offsets 0..224 hard-code a 15x15 block, while the vector offsets
   are computed with bs; the routine therefore assumes bs == 15 and bs2 == 225.

   NOTE(review): block 0 of x is written before the loop over block rows, so the
   routine appears to assume at least one block row (n >= 1) - confirm with callers.

   Returns 0 on success; any error from a PETSc call is propagated by CHKERRQ.
*/
2258832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver2(Mat A,Vec bb,Vec xx)
22592b0b2ea7SShri Abhyankar {
22602b0b2ea7SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
22612b0b2ea7SShri Abhyankar   PetscErrorCode    ierr;
2262b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
22630fa040f9SShri Abhyankar   PetscInt          i,nz,idx,idt,m;
22640b68f018SBarry Smith   const MatScalar   *aa=a->a,*v;
22652b0b2ea7SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15;
22662b0b2ea7SShri Abhyankar   PetscScalar       x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15;
22670fa040f9SShri Abhyankar   PetscScalar       *x;
22680b68f018SBarry Smith   const PetscScalar *b;
22692b0b2ea7SShri Abhyankar 
22702b0b2ea7SShri Abhyankar   PetscFunctionBegin;
22710b68f018SBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
22722b0b2ea7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
22732b0b2ea7SShri Abhyankar 
22742b0b2ea7SShri Abhyankar   /* forward solve the lower triangular */
  /* block row 0 has nothing to eliminate: x block 0 is a plain copy of b block 0 */
227529a97285SShri Abhyankar   idx    = 0;
22760fa040f9SShri Abhyankar   x[0]  = b[idx];    x[1]  = b[1+idx];  x[2]  = b[2+idx];  x[3]  = b[3+idx];  x[4]  = b[4+idx];
22770fa040f9SShri Abhyankar   x[5]  = b[5+idx];  x[6]  = b[6+idx];  x[7]  = b[7+idx];  x[8]  = b[8+idx];  x[9]  = b[9+idx];
22780fa040f9SShri Abhyankar   x[10] = b[10+idx]; x[11] = b[11+idx]; x[12] = b[12+idx]; x[13] = b[13+idx]; x[14] = b[14+idx];
22792b0b2ea7SShri Abhyankar 
22802b0b2ea7SShri Abhyankar   for (i=1; i<n; i++) {
    /* entries ai[i] .. ai[i+1]-1 hold the lower triangular blocks of block row i */
22812b0b2ea7SShri Abhyankar     v     = aa + bs2*ai[i];
22822b0b2ea7SShri Abhyankar     vi    = aj + ai[i];
22832b0b2ea7SShri Abhyankar     nz    = ai[i+1] - ai[i];
22840fa040f9SShri Abhyankar     idt   = bs*i;
    /* s1..s15 accumulate block i of the result, starting from b */
22850fa040f9SShri Abhyankar     s1   = b[idt];    s2  = b[1+idt];  s3  = b[2+idt];  s4  = b[3+idt];  s5  = b[4+idt];
22860fa040f9SShri Abhyankar     s6   = b[5+idt];  s7  = b[6+idt];  s8  = b[7+idt];  s9  = b[8+idt];  s10 = b[9+idt];
22870fa040f9SShri Abhyankar     s11  = b[10+idt]; s12 = b[11+idt]; s13 = b[12+idt]; s14 = b[13+idt]; s15 = b[14+idt];
22882b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
      /* load the x block belonging to block column vi[m] */
22892b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
22900fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
22910fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
22920fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
22932b0b2ea7SShri Abhyankar 
22940b8f6341SShri Abhyankar 
      /* s -= (15x15 block at v) * x; row r of the block uses v[r], v[r+15], ..., v[r+210] */
22952b0b2ea7SShri Abhyankar       s1 -=  v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
22962b0b2ea7SShri Abhyankar       s2 -=  v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
22972b0b2ea7SShri Abhyankar       s3 -=  v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
22982b0b2ea7SShri Abhyankar       s4 -=  v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
22992b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
23002b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
23012b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
23022b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
23032b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
23042b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
23052b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
23062b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
23072b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
23082b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
23092b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
23102b0b2ea7SShri Abhyankar 
23112b0b2ea7SShri Abhyankar       v += bs2;
23122b0b2ea7SShri Abhyankar     }
23130fa040f9SShri Abhyankar     x[idt]    = s1;  x[1+idt]  = s2;  x[2+idt]  = s3;  x[3+idt]  = s4;  x[4+idt]  = s5;
23140fa040f9SShri Abhyankar     x[5+idt]  = s6;  x[6+idt]  = s7;  x[7+idt]  = s8;  x[8+idt]  = s9;  x[9+idt]  = s10;
23150fa040f9SShri Abhyankar     x[10+idt] = s11; x[11+idt] = s12; x[12+idt] = s13; x[13+idt] = s14; x[14+idt] = s15;
23162b0b2ea7SShri Abhyankar 
23172b0b2ea7SShri Abhyankar   }
23182b0b2ea7SShri Abhyankar   /* backward solve the upper triangular */
23192b0b2ea7SShri Abhyankar   for (i=n-1; i>=0; i--){
    /* adiag decreases with i here: the off-diagonal blocks of block row i are
       entries adiag[i+1]+1 .. adiag[i]-1, followed by the block at adiag[i] */
23202b0b2ea7SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
23212b0b2ea7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
23222b0b2ea7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
23232b0b2ea7SShri Abhyankar     idt  = bs*i;
23240fa040f9SShri Abhyankar     s1   = x[idt];     s2  = x[1+idt];  s3  = x[2+idt];  s4  = x[3+idt];  s5  = x[4+idt];
23250fa040f9SShri Abhyankar     s6   = x[5+idt];   s7  = x[6+idt];  s8  = x[7+idt];  s9  = x[8+idt];  s10 = x[9+idt];
23260fa040f9SShri Abhyankar     s11  = x[10+idt]; s12  = x[11+idt]; s13 = x[12+idt]; s14 = x[13+idt]; s15 = x[14+idt];
23272b0b2ea7SShri Abhyankar 
23282b0b2ea7SShri Abhyankar     for(m=0;m<nz;m++){
23292b0b2ea7SShri Abhyankar       idx   = bs*vi[m];
23300fa040f9SShri Abhyankar       x1   = x[idx];     x2  = x[1+idx];  x3  = x[2+idx];  x4  = x[3+idx];  x5  = x[4+idx];
23310fa040f9SShri Abhyankar       x6   = x[5+idx];   x7  = x[6+idx];  x8  = x[7+idx];  x9  = x[8+idx];  x10 = x[9+idx];
23320fa040f9SShri Abhyankar       x11  = x[10+idx]; x12  = x[11+idx]; x13 = x[12+idx]; x14 = x[13+idx]; x15 = x[14+idx];
23332b0b2ea7SShri Abhyankar 
      /* s -= (15x15 block at v) * x, same layout as in the forward sweep */
23342b0b2ea7SShri Abhyankar       s1  -= v[0]*x1  + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7  + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
23352b0b2ea7SShri Abhyankar       s2  -= v[1]*x1  + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7  + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
23362b0b2ea7SShri Abhyankar       s3  -= v[2]*x1  + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7  + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
23372b0b2ea7SShri Abhyankar       s4  -= v[3]*x1  + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7  + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
23382b0b2ea7SShri Abhyankar       s5  -= v[4]*x1  + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7  + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
23392b0b2ea7SShri Abhyankar       s6  -= v[5]*x1  + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7  + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
23402b0b2ea7SShri Abhyankar       s7  -= v[6]*x1  + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7  + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
23412b0b2ea7SShri Abhyankar       s8  -= v[7]*x1  + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7  + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
23422b0b2ea7SShri Abhyankar       s9  -= v[8]*x1  + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7  + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
23432b0b2ea7SShri Abhyankar       s10 -= v[9]*x1  + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7  + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
23442b0b2ea7SShri Abhyankar       s11 -= v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
23452b0b2ea7SShri Abhyankar       s12 -= v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
23462b0b2ea7SShri Abhyankar       s13 -= v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
23472b0b2ea7SShri Abhyankar       s14 -= v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
23482b0b2ea7SShri Abhyankar       s15 -= v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
23492b0b2ea7SShri Abhyankar 
23502b0b2ea7SShri Abhyankar       v += bs2;
23512b0b2ea7SShri Abhyankar     }
23522b0b2ea7SShri Abhyankar 
    /* v now points at the block stored at adiag[i]; x block i = that block * s.
       NOTE(review): presumably this is the already-inverted diagonal block - confirm
       against the factorization routine */
23530fa040f9SShri Abhyankar     x[idt] = v[0]*s1  + v[15]*s2 + v[30]*s3 + v[45]*s4 + v[60]*s5 + v[75]*s6 + v[90]*s7  + v[105]*s8 + v[120]*s9 + v[135]*s10 + v[150]*s11 + v[165]*s12 + v[180]*s13 + v[195]*s14 + v[210]*s15;
23540fa040f9SShri Abhyankar     x[1+idt] = v[1]*s1  + v[16]*s2 + v[31]*s3 + v[46]*s4 + v[61]*s5 + v[76]*s6 + v[91]*s7  + v[106]*s8 + v[121]*s9 + v[136]*s10 + v[151]*s11 + v[166]*s12 + v[181]*s13 + v[196]*s14 + v[211]*s15;
23550fa040f9SShri Abhyankar     x[2+idt] = v[2]*s1  + v[17]*s2 + v[32]*s3 + v[47]*s4 + v[62]*s5 + v[77]*s6 + v[92]*s7  + v[107]*s8 + v[122]*s9 + v[137]*s10 + v[152]*s11 + v[167]*s12 + v[182]*s13 + v[197]*s14 + v[212]*s15;
23560fa040f9SShri Abhyankar     x[3+idt] = v[3]*s1  + v[18]*s2 + v[33]*s3 + v[48]*s4 + v[63]*s5 + v[78]*s6 + v[93]*s7  + v[108]*s8 + v[123]*s9 + v[138]*s10 + v[153]*s11 + v[168]*s12 + v[183]*s13 + v[198]*s14 + v[213]*s15;
23570fa040f9SShri Abhyankar     x[4+idt] = v[4]*s1  + v[19]*s2 + v[34]*s3 + v[49]*s4 + v[64]*s5 + v[79]*s6 + v[94]*s7  + v[109]*s8 + v[124]*s9 + v[139]*s10 + v[154]*s11 + v[169]*s12 + v[184]*s13 + v[199]*s14 + v[214]*s15;
23580fa040f9SShri Abhyankar     x[5+idt] = v[5]*s1  + v[20]*s2 + v[35]*s3 + v[50]*s4 + v[65]*s5 + v[80]*s6 + v[95]*s7  + v[110]*s8 + v[125]*s9 + v[140]*s10 + v[155]*s11 + v[170]*s12 + v[185]*s13 + v[200]*s14 + v[215]*s15;
23590fa040f9SShri Abhyankar     x[6+idt] = v[6]*s1  + v[21]*s2 + v[36]*s3 + v[51]*s4 + v[66]*s5 + v[81]*s6 + v[96]*s7  + v[111]*s8 + v[126]*s9 + v[141]*s10 + v[156]*s11 + v[171]*s12 + v[186]*s13 + v[201]*s14 + v[216]*s15;
23600fa040f9SShri Abhyankar     x[7+idt] = v[7]*s1  + v[22]*s2 + v[37]*s3 + v[52]*s4 + v[67]*s5 + v[82]*s6 + v[97]*s7  + v[112]*s8 + v[127]*s9 + v[142]*s10 + v[157]*s11 + v[172]*s12 + v[187]*s13 + v[202]*s14 + v[217]*s15;
23610fa040f9SShri Abhyankar     x[8+idt] = v[8]*s1  + v[23]*s2 + v[38]*s3 + v[53]*s4 + v[68]*s5 + v[83]*s6 + v[98]*s7  + v[113]*s8 + v[128]*s9 + v[143]*s10 + v[158]*s11 + v[173]*s12 + v[188]*s13 + v[203]*s14 + v[218]*s15;
23620fa040f9SShri Abhyankar     x[9+idt] = v[9]*s1  + v[24]*s2 + v[39]*s3 + v[54]*s4 + v[69]*s5 + v[84]*s6 + v[99]*s7  + v[114]*s8 + v[129]*s9 + v[144]*s10 + v[159]*s11 + v[174]*s12 + v[189]*s13 + v[204]*s14 + v[219]*s15;
23630fa040f9SShri Abhyankar     x[10+idt] = v[10]*s1 + v[25]*s2 + v[40]*s3 + v[55]*s4 + v[70]*s5 + v[85]*s6 + v[100]*s7 + v[115]*s8 + v[130]*s9 + v[145]*s10 + v[160]*s11 + v[175]*s12 + v[190]*s13 + v[205]*s14 + v[220]*s15;
23640fa040f9SShri Abhyankar     x[11+idt] = v[11]*s1 + v[26]*s2 + v[41]*s3 + v[56]*s4 + v[71]*s5 + v[86]*s6 + v[101]*s7 + v[116]*s8 + v[131]*s9 + v[146]*s10 + v[161]*s11 + v[176]*s12 + v[191]*s13 + v[206]*s14 + v[221]*s15;
23650fa040f9SShri Abhyankar     x[12+idt] = v[12]*s1 + v[27]*s2 + v[42]*s3 + v[57]*s4 + v[72]*s5 + v[87]*s6 + v[102]*s7 + v[117]*s8 + v[132]*s9 + v[147]*s10 + v[162]*s11 + v[177]*s12 + v[192]*s13 + v[207]*s14 + v[222]*s15;
23660fa040f9SShri Abhyankar     x[13+idt] = v[13]*s1 + v[28]*s2 + v[43]*s3 + v[58]*s4 + v[73]*s5 + v[88]*s6 + v[103]*s7 + v[118]*s8 + v[133]*s9 + v[148]*s10 + v[163]*s11 + v[178]*s12 + v[193]*s13 + v[208]*s14 + v[223]*s15;
23670fa040f9SShri Abhyankar     x[14+idt] = v[14]*s1 + v[29]*s2 + v[44]*s3 + v[59]*s4 + v[74]*s5 + v[89]*s6 + v[104]*s7 + v[119]*s8 + v[134]*s9 + v[149]*s10 + v[164]*s11 + v[179]*s12 + v[194]*s13 + v[209]*s14 + v[224]*s15;
23682b0b2ea7SShri Abhyankar 
23692b0b2ea7SShri Abhyankar   }
23702b0b2ea7SShri Abhyankar 
23710b68f018SBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23722b0b2ea7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
23732b0b2ea7SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
23742b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
23752b0b2ea7SShri Abhyankar }
23762b0b2ea7SShri Abhyankar 
2377832cc040SShri Abhyankar /* bs = 15 for PFLOTRAN. Block operations are done by accessing one column at a time */
2378832cc040SShri Abhyankar /* Default MatSolve for block size 15 */
2379832cc040SShri Abhyankar 
23808499736aSShri Abhyankar #undef __FUNCT__
2381832cc040SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_15_NaturalOrdering_ver1"
2382832cc040SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_15_NaturalOrdering_ver1(Mat A,Vec bb,Vec xx)
23830b8f6341SShri Abhyankar {
23840b8f6341SShri Abhyankar   Mat_SeqBAIJ      *a=(Mat_SeqBAIJ *)A->data;
23850b8f6341SShri Abhyankar   PetscErrorCode    ierr;
23860b8f6341SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*adiag=a->diag,*vi,bs=A->rmap->bs,bs2=a->bs2;
23870fa040f9SShri Abhyankar   PetscInt          i,k,nz,kdx,idx,idt,m;
23880b8f6341SShri Abhyankar   const MatScalar   *aa=a->a,*v;
23890b8f6341SShri Abhyankar   PetscScalar       s[15];
23900fa040f9SShri Abhyankar   PetscScalar       *x;
23910b8f6341SShri Abhyankar   const PetscScalar *b;
23920b8f6341SShri Abhyankar 
23930b8f6341SShri Abhyankar   PetscFunctionBegin;
23940b8f6341SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
23950b8f6341SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
23960b8f6341SShri Abhyankar 
23970b8f6341SShri Abhyankar   /* forward solve the lower triangular */
2398832cc040SShri Abhyankar   for (i=0; i<n; i++) {
23990b8f6341SShri Abhyankar     v     = aa + bs2*ai[i];
24000b8f6341SShri Abhyankar     vi    = aj + ai[i];
24010b8f6341SShri Abhyankar     nz    = ai[i+1] - ai[i];
24020fa040f9SShri Abhyankar     idt   = bs*i;
2403832cc040SShri Abhyankar     x[idt]   = b[idt];    x[1+idt]  = b[1+idt];  x[2+idt]  = b[2+idt];  x[3+idt]  = b[3+idt];  x[4+idt]  = b[4+idt];
2404832cc040SShri Abhyankar     x[5+idt]   = b[5+idt];  x[6+idt]  = b[6+idt];  x[7+idt]  = b[7+idt];  x[8+idt]  = b[8+idt];  x[9+idt] = b[9+idt];
2405832cc040SShri Abhyankar     x[10+idt]  = b[10+idt]; x[11+idt] = b[11+idt]; x[12+idt] = b[12+idt]; x[13+idt] = b[13+idt]; x[14+idt] = b[14+idt];
24060b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
24070b8f6341SShri Abhyankar       idx   = bs*vi[m];
24080b8f6341SShri Abhyankar       for(k=0;k<15;k++){
24090fa040f9SShri Abhyankar 	kdx = k + idx;
2410832cc040SShri Abhyankar 	x[idt]    -= v[0]*x[kdx];
2411832cc040SShri Abhyankar 	x[1+idt]  -= v[1]*x[kdx];
2412832cc040SShri Abhyankar 	x[2+idt]  -= v[2]*x[kdx];
2413832cc040SShri Abhyankar         x[3+idt]  -= v[3]*x[kdx];
2414832cc040SShri Abhyankar 	x[4+idt]  -= v[4]*x[kdx];
2415832cc040SShri Abhyankar 	x[5+idt]  -= v[5]*x[kdx];
2416832cc040SShri Abhyankar 	x[6+idt]  -= v[6]*x[kdx];
2417832cc040SShri Abhyankar         x[7+idt]  -= v[7]*x[kdx];
2418832cc040SShri Abhyankar 	x[8+idt]  -= v[8]*x[kdx];
2419832cc040SShri Abhyankar 	x[9+idt]  -= v[9]*x[kdx];
2420832cc040SShri Abhyankar 	x[10+idt] -= v[10]*x[kdx];
2421832cc040SShri Abhyankar         x[11+idt] -= v[11]*x[kdx];
2422832cc040SShri Abhyankar 	x[12+idt] -= v[12]*x[kdx];
2423832cc040SShri Abhyankar 	x[13+idt] -= v[13]*x[kdx];
2424832cc040SShri Abhyankar 	x[14+idt] -= v[14]*x[kdx];
24250b8f6341SShri Abhyankar 	v += 15;
24260b8f6341SShri Abhyankar       }
24270b8f6341SShri Abhyankar     }
24280b8f6341SShri Abhyankar   }
24290b8f6341SShri Abhyankar   /* backward solve the upper triangular */
24300b8f6341SShri Abhyankar   for (i=n-1; i>=0; i--){
24310b8f6341SShri Abhyankar     v    = aa + bs2*(adiag[i+1]+1);
24320b8f6341SShri Abhyankar     vi   = aj + adiag[i+1]+1;
24330b8f6341SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
24340b8f6341SShri Abhyankar     idt  = bs*i;
24350fa040f9SShri Abhyankar     s[0]   = x[idt];    s[1]  = x[1+idt];  s[2]  = x[2+idt];  s[3]  = x[3+idt];  s[4]  = x[4+idt];
24360fa040f9SShri Abhyankar     s[5]   = x[5+idt];  s[6]  = x[6+idt];  s[7]  = x[7+idt];  s[8]  = x[8+idt];  s[9]  = x[9+idt];
24370fa040f9SShri Abhyankar     s[10]  = x[10+idt]; s[11] = x[11+idt]; s[12] = x[12+idt]; s[13] = x[13+idt]; s[14] = x[14+idt];
24380b8f6341SShri Abhyankar 
24390b8f6341SShri Abhyankar     for(m=0;m<nz;m++){
24400b8f6341SShri Abhyankar       idx   = bs*vi[m];
24410b8f6341SShri Abhyankar       for(k=0;k<15;k++){
24420fa040f9SShri Abhyankar 	kdx = k + idx;
24430fa040f9SShri Abhyankar 	s[0]  -= v[0]*x[kdx];
24440fa040f9SShri Abhyankar 	s[1]  -= v[1]*x[kdx];
24450fa040f9SShri Abhyankar 	s[2]  -= v[2]*x[kdx];
24460fa040f9SShri Abhyankar         s[3]  -= v[3]*x[kdx];
24470fa040f9SShri Abhyankar 	s[4]  -= v[4]*x[kdx];
24480fa040f9SShri Abhyankar 	s[5]  -= v[5]*x[kdx];
24490fa040f9SShri Abhyankar 	s[6]  -= v[6]*x[kdx];
24500fa040f9SShri Abhyankar         s[7]  -= v[7]*x[kdx];
24510fa040f9SShri Abhyankar 	s[8]  -= v[8]*x[kdx];
24520fa040f9SShri Abhyankar 	s[9]  -= v[9]*x[kdx];
24530fa040f9SShri Abhyankar 	s[10] -= v[10]*x[kdx];
24540fa040f9SShri Abhyankar         s[11] -= v[11]*x[kdx];
24550fa040f9SShri Abhyankar 	s[12] -= v[12]*x[kdx];
24560fa040f9SShri Abhyankar 	s[13] -= v[13]*x[kdx];
24570fa040f9SShri Abhyankar 	s[14] -= v[14]*x[kdx];
24580b8f6341SShri Abhyankar 	v += 15;
24590b8f6341SShri Abhyankar       }
24600b8f6341SShri Abhyankar     }
24610fa040f9SShri Abhyankar     ierr = PetscMemzero(x+idt,bs*sizeof(MatScalar));CHKERRQ(ierr);
24620b8f6341SShri Abhyankar     for(k=0;k<15;k++){
24630fa040f9SShri Abhyankar       x[idt]    += v[0]*s[k];
24640fa040f9SShri Abhyankar       x[1+idt]  += v[1]*s[k];
24650fa040f9SShri Abhyankar       x[2+idt]  += v[2]*s[k];
24660fa040f9SShri Abhyankar       x[3+idt]  += v[3]*s[k];
24670fa040f9SShri Abhyankar       x[4+idt]  += v[4]*s[k];
24680fa040f9SShri Abhyankar       x[5+idt]  += v[5]*s[k];
24690fa040f9SShri Abhyankar       x[6+idt]  += v[6]*s[k];
24700fa040f9SShri Abhyankar       x[7+idt]  += v[7]*s[k];
24710fa040f9SShri Abhyankar       x[8+idt]  += v[8]*s[k];
24720fa040f9SShri Abhyankar       x[9+idt]  += v[9]*s[k];
24730fa040f9SShri Abhyankar       x[10+idt] += v[10]*s[k];
24740fa040f9SShri Abhyankar       x[11+idt] += v[11]*s[k];
24750fa040f9SShri Abhyankar       x[12+idt] += v[12]*s[k];
24760fa040f9SShri Abhyankar       x[13+idt] += v[13]*s[k];
24770fa040f9SShri Abhyankar       x[14+idt] += v[14]*s[k];
24780b8f6341SShri Abhyankar       v += 15;
24790b8f6341SShri Abhyankar     }
24800b8f6341SShri Abhyankar   }
24810b8f6341SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
24820b8f6341SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
24830b8f6341SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
24840b8f6341SShri Abhyankar   PetscFunctionReturn(0);
24850b8f6341SShri Abhyankar }
24860b8f6341SShri Abhyankar 
24870b8f6341SShri Abhyankar 
24880b8f6341SShri Abhyankar #undef __FUNCT__
248906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_inplace"
249006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_inplace(Mat A,Vec bb,Vec xx)
24914e2b4712SSatish Balay {
24924e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
24934e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
24946849ba73SBarry Smith   PetscErrorCode    ierr;
2495b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j;
2496b3260449SShri Abhyankar   const PetscInt    *rout,*cout,*diag = a->diag,*vi,n=a->mbs;
2497b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2498b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2499b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2500b3260449SShri Abhyankar   const PetscScalar *b;
25014e2b4712SSatish Balay 
25024e2b4712SSatish Balay   PetscFunctionBegin;
2503b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25041ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2505f1af5d2fSBarry Smith   t  = a->solve_work;
25064e2b4712SSatish Balay 
25074e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
25084e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
25094e2b4712SSatish Balay 
25104e2b4712SSatish Balay   /* forward solve the lower triangular */
25114e2b4712SSatish Balay   idx    = 7*(*r++);
2512f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2513f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
2514f1af5d2fSBarry Smith   t[5] = b[5+idx]; t[6] = b[6+idx];
25154e2b4712SSatish Balay 
25164e2b4712SSatish Balay   for (i=1; i<n; i++) {
25174e2b4712SSatish Balay     v     = aa + 49*ai[i];
25184e2b4712SSatish Balay     vi    = aj + ai[i];
25194e2b4712SSatish Balay     nz    = diag[i] - ai[i];
25204e2b4712SSatish Balay     idx   = 7*(*r++);
2521f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2522f1af5d2fSBarry Smith     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
25234e2b4712SSatish Balay     while (nz--) {
25244e2b4712SSatish Balay       idx   = 7*(*vi++);
2525f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
2526f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
2527f1af5d2fSBarry Smith       x6    = t[5+idx];x7 = t[6+idx];
2528f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2529f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2530f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2531f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2532f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2533f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2534f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
25354e2b4712SSatish Balay       v += 49;
25364e2b4712SSatish Balay     }
25374e2b4712SSatish Balay     idx = 7*i;
2538f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2539f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
2540f1af5d2fSBarry Smith     t[5+idx] = s6;t[6+idx] = s7;
25414e2b4712SSatish Balay   }
25424e2b4712SSatish Balay   /* backward solve the upper triangular */
25434e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
25444e2b4712SSatish Balay     v    = aa + 49*diag[i] + 49;
25454e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
25464e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
25474e2b4712SSatish Balay     idt  = 7*i;
2548f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2549f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
2550f1af5d2fSBarry Smith     s6 = t[5+idt];s7 = t[6+idt];
25514e2b4712SSatish Balay     while (nz--) {
25524e2b4712SSatish Balay       idx   = 7*(*vi++);
2553f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2554f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
2555f1af5d2fSBarry Smith       x6    = t[5+idx]; x7 = t[6+idx];
2556f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2557f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2558f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2559f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2560f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2561f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2562f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
25634e2b4712SSatish Balay       v += 49;
25644e2b4712SSatish Balay     }
25654e2b4712SSatish Balay     idc = 7*(*c--);
25664e2b4712SSatish Balay     v   = aa + 49*diag[i];
2567f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
2568f1af5d2fSBarry Smith                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
2569f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
2570f1af5d2fSBarry Smith                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
2571f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
2572f1af5d2fSBarry Smith                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
2573f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
2574f1af5d2fSBarry Smith                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
2575f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
2576f1af5d2fSBarry Smith                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
2577f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
2578f1af5d2fSBarry Smith                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
2579f1af5d2fSBarry Smith     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
2580f1af5d2fSBarry Smith                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
25814e2b4712SSatish Balay   }
25824e2b4712SSatish Balay 
25834e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
25844e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2585b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
25861ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2587dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
25884e2b4712SSatish Balay   PetscFunctionReturn(0);
25894e2b4712SSatish Balay }
25904e2b4712SSatish Balay 
25918f690400SShri Abhyankar #undef __FUNCT__
25924dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7"
25934dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7(Mat A,Vec bb,Vec xx)
259435aa4fcfSShri Abhyankar {
259535aa4fcfSShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
259635aa4fcfSShri Abhyankar   IS                iscol=a->col,isrow=a->row;
259735aa4fcfSShri Abhyankar   PetscErrorCode    ierr;
2598b3260449SShri Abhyankar   const PetscInt    *r,*c,*ai=a->i,*aj=a->j,*adiag=a->diag;
2599b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*rout,*cout,*vi;
2600b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
2601b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
2602b3260449SShri Abhyankar   PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7,*x,*t;
2603b3260449SShri Abhyankar   const PetscScalar *b;
260435aa4fcfSShri Abhyankar 
260535aa4fcfSShri Abhyankar   PetscFunctionBegin;
2606b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
260735aa4fcfSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
260835aa4fcfSShri Abhyankar   t  = a->solve_work;
260935aa4fcfSShri Abhyankar 
261035aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
261135aa4fcfSShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
261235aa4fcfSShri Abhyankar 
261335aa4fcfSShri Abhyankar   /* forward solve the lower triangular */
261435aa4fcfSShri Abhyankar   idx    = 7*r[0];
261535aa4fcfSShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
261635aa4fcfSShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
261735aa4fcfSShri Abhyankar   t[5] = b[5+idx]; t[6] = b[6+idx];
261835aa4fcfSShri Abhyankar 
261935aa4fcfSShri Abhyankar   for (i=1; i<n; i++) {
262035aa4fcfSShri Abhyankar     v     = aa + 49*ai[i];
262135aa4fcfSShri Abhyankar     vi    = aj + ai[i];
262235aa4fcfSShri Abhyankar     nz    = ai[i+1] - ai[i];
262335aa4fcfSShri Abhyankar     idx   = 7*r[i];
262435aa4fcfSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
262535aa4fcfSShri Abhyankar     s5  = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
262635aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
262735aa4fcfSShri Abhyankar       idx   = 7*vi[m];
262835aa4fcfSShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
262935aa4fcfSShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
263035aa4fcfSShri Abhyankar       x6    = t[5+idx];x7 = t[6+idx];
263135aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
263235aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
263335aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
263435aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
263535aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
263635aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
263735aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
263835aa4fcfSShri Abhyankar       v += 49;
263935aa4fcfSShri Abhyankar     }
264035aa4fcfSShri Abhyankar     idx = 7*i;
264135aa4fcfSShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
264235aa4fcfSShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
264335aa4fcfSShri Abhyankar     t[5+idx] = s6;t[6+idx] = s7;
264435aa4fcfSShri Abhyankar   }
264535aa4fcfSShri Abhyankar   /* backward solve the upper triangular */
264635aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
264735aa4fcfSShri Abhyankar     v    = aa + 49*(adiag[i+1]+1);
264835aa4fcfSShri Abhyankar     vi   = aj + adiag[i+1]+1;
264935aa4fcfSShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
265035aa4fcfSShri Abhyankar     idt  = 7*i;
265135aa4fcfSShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
265235aa4fcfSShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
265335aa4fcfSShri Abhyankar     s6 = t[5+idt];s7 = t[6+idt];
265435aa4fcfSShri Abhyankar     for(m=0;m<nz;m++){
265535aa4fcfSShri Abhyankar       idx   = 7*vi[m];
265635aa4fcfSShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
265735aa4fcfSShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
265835aa4fcfSShri Abhyankar       x6    = t[5+idx]; x7 = t[6+idx];
265935aa4fcfSShri Abhyankar       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
266035aa4fcfSShri Abhyankar       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
266135aa4fcfSShri Abhyankar       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
266235aa4fcfSShri Abhyankar       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
266335aa4fcfSShri Abhyankar       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
266435aa4fcfSShri Abhyankar       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
266535aa4fcfSShri Abhyankar       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
266635aa4fcfSShri Abhyankar       v += 49;
266735aa4fcfSShri Abhyankar     }
266835aa4fcfSShri Abhyankar     idc = 7*c[i];
266935aa4fcfSShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[7]*s2+v[14]*s3+
267035aa4fcfSShri Abhyankar                                  v[21]*s4+v[28]*s5+v[35]*s6+v[42]*s7;
267135aa4fcfSShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[8]*s2+v[15]*s3+
267235aa4fcfSShri Abhyankar                                  v[22]*s4+v[29]*s5+v[36]*s6+v[43]*s7;
267335aa4fcfSShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[9]*s2+v[16]*s3+
267435aa4fcfSShri Abhyankar                                  v[23]*s4+v[30]*s5+v[37]*s6+v[44]*s7;
267535aa4fcfSShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[10]*s2+v[17]*s3+
267635aa4fcfSShri Abhyankar                                  v[24]*s4+v[31]*s5+v[38]*s6+v[45]*s7;
267735aa4fcfSShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[11]*s2+v[18]*s3+
267835aa4fcfSShri Abhyankar                                  v[25]*s4+v[32]*s5+v[39]*s6+v[46]*s7;
267935aa4fcfSShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[12]*s2+v[19]*s3+
268035aa4fcfSShri Abhyankar                                  v[26]*s4+v[33]*s5+v[40]*s6+v[47]*s7;
268135aa4fcfSShri Abhyankar     x[6+idc] = t[6+idt] = v[6]*s1+v[13]*s2+v[20]*s3+
268235aa4fcfSShri Abhyankar                                  v[27]*s4+v[34]*s5+v[41]*s6+v[48]*s7;
268335aa4fcfSShri Abhyankar   }
268435aa4fcfSShri Abhyankar 
268535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
268635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2687b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
268835aa4fcfSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
268935aa4fcfSShri Abhyankar   ierr = PetscLogFlops(2.0*49*(a->nz) - 7.0*A->cmap->n);CHKERRQ(ierr);
269035aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
269135aa4fcfSShri Abhyankar }
269235aa4fcfSShri Abhyankar 
269335aa4fcfSShri Abhyankar #undef __FUNCT__
269406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering_inplace"
269506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
269615091d37SBarry Smith {
269715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2698b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2699dfbe8321SBarry Smith   PetscErrorCode    ierr;
2700b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
2701d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2702d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
2703d9fead3dSBarry Smith   const PetscScalar *b;
270415091d37SBarry Smith 
270515091d37SBarry Smith   PetscFunctionBegin;
2706d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27071ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
270815091d37SBarry Smith   /* forward solve the lower triangular */
270915091d37SBarry Smith   idx    = 0;
271015091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
271115091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
271215091d37SBarry Smith   x[6] = b[6+idx];
271315091d37SBarry Smith   for (i=1; i<n; i++) {
271415091d37SBarry Smith     v     =  aa + 49*ai[i];
271515091d37SBarry Smith     vi    =  aj + ai[i];
271615091d37SBarry Smith     nz    =  diag[i] - ai[i];
271715091d37SBarry Smith     idx   =  7*i;
2718f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
2719f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
2720f1af5d2fSBarry Smith     s7  =  b[6+idx];
272115091d37SBarry Smith     while (nz--) {
272215091d37SBarry Smith       jdx   = 7*(*vi++);
272315091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
272415091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
272515091d37SBarry Smith       x7    = x[6+jdx];
2726f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2727f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2728f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2729f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2730f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2731f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2732f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
273315091d37SBarry Smith       v += 49;
273415091d37SBarry Smith      }
2735f1af5d2fSBarry Smith     x[idx]   = s1;
2736f1af5d2fSBarry Smith     x[1+idx] = s2;
2737f1af5d2fSBarry Smith     x[2+idx] = s3;
2738f1af5d2fSBarry Smith     x[3+idx] = s4;
2739f1af5d2fSBarry Smith     x[4+idx] = s5;
2740f1af5d2fSBarry Smith     x[5+idx] = s6;
2741f1af5d2fSBarry Smith     x[6+idx] = s7;
274215091d37SBarry Smith   }
274315091d37SBarry Smith   /* backward solve the upper triangular */
274415091d37SBarry Smith   for (i=n-1; i>=0; i--){
274515091d37SBarry Smith     v    = aa + 49*diag[i] + 49;
274615091d37SBarry Smith     vi   = aj + diag[i] + 1;
274715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
274815091d37SBarry Smith     idt  = 7*i;
2749f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
2750f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
2751f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
2752f1af5d2fSBarry Smith     s7 = x[6+idt];
275315091d37SBarry Smith     while (nz--) {
275415091d37SBarry Smith       idx   = 7*(*vi++);
275515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
275615091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
275715091d37SBarry Smith       x7    = x[6+idx];
2758f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[7]*x2  + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
2759f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[8]*x2  + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
2760f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[9]*x2  + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
2761f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
2762f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
2763f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
2764f1af5d2fSBarry Smith       s7 -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
276515091d37SBarry Smith       v += 49;
276615091d37SBarry Smith     }
276715091d37SBarry Smith     v        = aa + 49*diag[i];
2768f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[7]*s2  + v[14]*s3 + v[21]*s4
2769f1af5d2fSBarry Smith                          + v[28]*s5 + v[35]*s6 + v[42]*s7;
2770f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[8]*s2  + v[15]*s3 + v[22]*s4
2771f1af5d2fSBarry Smith                          + v[29]*s5 + v[36]*s6 + v[43]*s7;
2772f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[9]*s2  + v[16]*s3 + v[23]*s4
2773f1af5d2fSBarry Smith                          + v[30]*s5 + v[37]*s6 + v[44]*s7;
2774f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[10]*s2  + v[17]*s3 + v[24]*s4
2775f1af5d2fSBarry Smith                          + v[31]*s5 + v[38]*s6 + v[45]*s7;
2776f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[11]*s2  + v[18]*s3 + v[25]*s4
2777f1af5d2fSBarry Smith                          + v[32]*s5 + v[39]*s6 + v[46]*s7;
2778f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[12]*s2  + v[19]*s3 + v[26]*s4
2779f1af5d2fSBarry Smith                          + v[33]*s5 + v[40]*s6 + v[47]*s7;
2780f1af5d2fSBarry Smith     x[6+idt] = v[6]*s1 + v[13]*s2  + v[20]*s3 + v[27]*s4
2781f1af5d2fSBarry Smith                          + v[34]*s5 + v[41]*s6 + v[48]*s7;
278215091d37SBarry Smith   }
278315091d37SBarry Smith 
2784d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
27851ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2786dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
278715091d37SBarry Smith   PetscFunctionReturn(0);
278815091d37SBarry Smith }
278915091d37SBarry Smith 
2790cee9d6f2SShri Abhyankar #undef __FUNCT__
27914dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_7_NaturalOrdering"
27924dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_7_NaturalOrdering(Mat A,Vec bb,Vec xx)
279353cca76cSShri Abhyankar {
279453cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
2795b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
279653cca76cSShri Abhyankar     PetscErrorCode    ierr;
2797b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
2798b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
279953cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
280053cca76cSShri Abhyankar     PetscScalar       *x;
280153cca76cSShri Abhyankar     const PetscScalar *b;
280253cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,s7,x1,x2,x3,x4,x5,x6,x7;
280353cca76cSShri Abhyankar 
280453cca76cSShri Abhyankar     PetscFunctionBegin;
280553cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
280653cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
280753cca76cSShri Abhyankar     /* forward solve the lower triangular */
280853cca76cSShri Abhyankar     idx    = 0;
280953cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
281053cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];x[6] = b[6+idx];
281153cca76cSShri Abhyankar     for (i=1; i<n; i++) {
281253cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
281353cca76cSShri Abhyankar        vi   = aj + ai[i];
281453cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
281553cca76cSShri Abhyankar       idx   = bs*i;
281653cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
281753cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];s7 = b[6+idx];
281853cca76cSShri Abhyankar        for(k=0;k<nz;k++) {
281953cca76cSShri Abhyankar           jdx   = bs*vi[k];
282053cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
282153cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];x7 = x[6+jdx];
282253cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
282353cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
282453cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
282553cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
282653cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
282753cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
282853cca76cSShri Abhyankar 	  s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
282953cca76cSShri Abhyankar           v   +=  bs2;
283053cca76cSShri Abhyankar         }
283153cca76cSShri Abhyankar 
283253cca76cSShri Abhyankar        x[idx]   = s1;
283353cca76cSShri Abhyankar        x[1+idx] = s2;
283453cca76cSShri Abhyankar        x[2+idx] = s3;
283553cca76cSShri Abhyankar        x[3+idx] = s4;
283653cca76cSShri Abhyankar        x[4+idx] = s5;
283753cca76cSShri Abhyankar        x[5+idx] = s6;
283853cca76cSShri Abhyankar        x[6+idx] = s7;
283953cca76cSShri Abhyankar     }
284053cca76cSShri Abhyankar 
284153cca76cSShri Abhyankar    /* backward solve the upper triangular */
284253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
284353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
284453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
284553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
284653cca76cSShri Abhyankar      idt = bs*i;
284753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
284853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];s7 = x[6+idt];
284953cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
285053cca76cSShri Abhyankar       idx   = bs*vi[k];
285153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
285253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];x7 = x[6+idx];
285353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4  + v[28]*x5 + v[35]*x6 + v[42]*x7;
285453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4  + v[29]*x5 + v[36]*x6 + v[43]*x7;
285553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4  + v[30]*x5 + v[37]*x6 + v[44]*x7;
285653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4  + v[31]*x5 + v[38]*x6 + v[45]*x7;
285753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4  + v[32]*x5 + v[39]*x6 + v[46]*x7;
285853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4  + v[33]*x5 + v[40]*x6 + v[47]*x7;
285953cca76cSShri Abhyankar        s7   -= v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4  + v[34]*x5 + v[41]*x6 + v[48]*x7;
286053cca76cSShri Abhyankar         v   +=  bs2;
286153cca76cSShri Abhyankar     }
286253cca76cSShri Abhyankar     /* x = inv_diagonal*x */
286353cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[7]*s2 + v[14]*s3 + v[21]*s4  + v[28]*s5 + v[35]*s6 + v[42]*s7;
286453cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[8]*s2 + v[15]*s3 + v[22]*s4  + v[29]*s5 + v[36]*s6 + v[43]*s7;
286553cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[9]*s2 + v[16]*s3 + v[23]*s4  + v[30]*s5 + v[37]*s6 + v[44]*s7;
286653cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[10]*s2 + v[17]*s3 + v[24]*s4  + v[31]*s5 + v[38]*s6 + v[45]*s7;
286753cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[11]*s2 + v[18]*s3 + v[25]*s4  + v[32]*s5 + v[39]*s6 + v[46]*s7;
286853cca76cSShri Abhyankar     x[5+idt] = v[5]*s1 + v[12]*s2 + v[19]*s3 + v[26]*s4  + v[33]*s5 + v[40]*s6 + v[47]*s7;
286953cca76cSShri Abhyankar     x[6+idt] = v[6]*s1 + v[13]*s2 + v[20]*s3 + v[27]*s4  + v[34]*s5 + v[41]*s6 + v[48]*s7;
287053cca76cSShri Abhyankar   }
287153cca76cSShri Abhyankar 
287253cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
287353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
287453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
287553cca76cSShri Abhyankar   PetscFunctionReturn(0);
287653cca76cSShri Abhyankar }
287753cca76cSShri Abhyankar 
287853cca76cSShri Abhyankar #undef __FUNCT__
287906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_inplace"
288006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_inplace(Mat A,Vec bb,Vec xx)
288115091d37SBarry Smith {
288215091d37SBarry Smith   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
288315091d37SBarry Smith   IS                iscol=a->col,isrow=a->row;
28846849ba73SBarry Smith   PetscErrorCode    ierr;
28855d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout;
2886b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
2887b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
2888d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
2889d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
2890d9fead3dSBarry Smith   const PetscScalar *b;
2891b3260449SShri Abhyankar 
289215091d37SBarry Smith   PetscFunctionBegin;
2893d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
28941ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
2895f1af5d2fSBarry Smith   t  = a->solve_work;
289615091d37SBarry Smith 
289715091d37SBarry Smith   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
289815091d37SBarry Smith   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
289915091d37SBarry Smith 
290015091d37SBarry Smith   /* forward solve the lower triangular */
290115091d37SBarry Smith   idx    = 6*(*r++);
2902f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
2903f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
2904f1af5d2fSBarry Smith   t[4] = b[4+idx]; t[5] = b[5+idx];
290515091d37SBarry Smith   for (i=1; i<n; i++) {
290615091d37SBarry Smith     v     = aa + 36*ai[i];
290715091d37SBarry Smith     vi    = aj + ai[i];
290815091d37SBarry Smith     nz    = diag[i] - ai[i];
290915091d37SBarry Smith     idx   = 6*(*r++);
2910f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
2911f1af5d2fSBarry Smith     s5  = b[4+idx]; s6 = b[5+idx];
291215091d37SBarry Smith     while (nz--) {
291315091d37SBarry Smith       idx   = 6*(*vi++);
2914f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
2915f1af5d2fSBarry Smith       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
2916f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2917f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2918f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2919f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2920f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2921f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
292215091d37SBarry Smith       v += 36;
292315091d37SBarry Smith     }
292415091d37SBarry Smith     idx = 6*i;
2925f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
2926f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
2927f1af5d2fSBarry Smith     t[4+idx] = s5;t[5+idx] = s6;
292815091d37SBarry Smith   }
292915091d37SBarry Smith   /* backward solve the upper triangular */
293015091d37SBarry Smith   for (i=n-1; i>=0; i--){
293115091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
293215091d37SBarry Smith     vi   = aj + diag[i] + 1;
293315091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
293415091d37SBarry Smith     idt  = 6*i;
2935f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
2936f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
2937f1af5d2fSBarry Smith     s5 = t[4+idt];s6 = t[5+idt];
293815091d37SBarry Smith     while (nz--) {
293915091d37SBarry Smith       idx   = 6*(*vi++);
2940f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
2941f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
2942f1af5d2fSBarry Smith       x5    = t[4+idx]; x6 = t[5+idx];
2943f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
2944f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
2945f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
2946f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
2947f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
2948f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
294915091d37SBarry Smith       v += 36;
295015091d37SBarry Smith     }
295115091d37SBarry Smith     idc = 6*(*c--);
295215091d37SBarry Smith     v   = aa + 36*diag[i];
2953f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
2954f1af5d2fSBarry Smith                                  v[18]*s4+v[24]*s5+v[30]*s6;
2955f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
2956f1af5d2fSBarry Smith                                  v[19]*s4+v[25]*s5+v[31]*s6;
2957f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
2958f1af5d2fSBarry Smith                                  v[20]*s4+v[26]*s5+v[32]*s6;
2959f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
2960f1af5d2fSBarry Smith                                  v[21]*s4+v[27]*s5+v[33]*s6;
2961f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
2962f1af5d2fSBarry Smith                                  v[22]*s4+v[28]*s5+v[34]*s6;
2963f1af5d2fSBarry Smith     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
2964f1af5d2fSBarry Smith                                  v[23]*s4+v[29]*s5+v[35]*s6;
296515091d37SBarry Smith   }
296615091d37SBarry Smith 
296715091d37SBarry Smith   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
296815091d37SBarry Smith   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
2969d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29701ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
2971dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
297215091d37SBarry Smith   PetscFunctionReturn(0);
297315091d37SBarry Smith }
297415091d37SBarry Smith 
29756506fda5SShri Abhyankar #undef __FUNCT__
29764dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6"
29774dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6(Mat A,Vec bb,Vec xx)
29786506fda5SShri Abhyankar {
29796506fda5SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
29806506fda5SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
29816506fda5SShri Abhyankar   PetscErrorCode    ierr;
29826506fda5SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
2983b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
2984b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
29856506fda5SShri Abhyankar   const MatScalar   *aa=a->a,*v;
29866506fda5SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6,*t;
29876506fda5SShri Abhyankar   const PetscScalar *b;
2988b3260449SShri Abhyankar 
29896506fda5SShri Abhyankar   PetscFunctionBegin;
29906506fda5SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
29916506fda5SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
29926506fda5SShri Abhyankar   t  = a->solve_work;
29936506fda5SShri Abhyankar 
29946506fda5SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
29956506fda5SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
29966506fda5SShri Abhyankar 
29976506fda5SShri Abhyankar   /* forward solve the lower triangular */
29986506fda5SShri Abhyankar   idx    = 6*r[0];
29996506fda5SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
30006506fda5SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
30016506fda5SShri Abhyankar   t[4] = b[4+idx]; t[5] = b[5+idx];
30026506fda5SShri Abhyankar   for (i=1; i<n; i++) {
30036506fda5SShri Abhyankar     v     = aa + 36*ai[i];
30046506fda5SShri Abhyankar     vi    = aj + ai[i];
30056506fda5SShri Abhyankar     nz    = ai[i+1] - ai[i];
30066506fda5SShri Abhyankar     idx   = 6*r[i];
30076506fda5SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
30086506fda5SShri Abhyankar     s5  = b[4+idx]; s6 = b[5+idx];
30096506fda5SShri Abhyankar     for(m=0;m<nz;m++){
30106506fda5SShri Abhyankar       idx   = 6*vi[m];
30116506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx]; x3 = t[2+idx];
30126506fda5SShri Abhyankar       x4    = t[3+idx]; x5 = t[4+idx]; x6 = t[5+idx];
30136506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30146506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30156506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30166506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
30176506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
30186506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30196506fda5SShri Abhyankar       v += 36;
30206506fda5SShri Abhyankar     }
30216506fda5SShri Abhyankar     idx = 6*i;
30226506fda5SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
30236506fda5SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
30246506fda5SShri Abhyankar     t[4+idx] = s5;t[5+idx] = s6;
30256506fda5SShri Abhyankar   }
30266506fda5SShri Abhyankar   /* backward solve the upper triangular */
30276506fda5SShri Abhyankar   for (i=n-1; i>=0; i--){
30286506fda5SShri Abhyankar     v    = aa + 36*(adiag[i+1]+1);
30296506fda5SShri Abhyankar     vi   = aj + adiag[i+1]+1;
30306506fda5SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
30316506fda5SShri Abhyankar     idt  = 6*i;
30326506fda5SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
30336506fda5SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
30346506fda5SShri Abhyankar     s5 = t[4+idt];s6 = t[5+idt];
30356506fda5SShri Abhyankar     for(m=0;m<nz;m++){
30366506fda5SShri Abhyankar       idx   = 6*vi[m];
30376506fda5SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
30386506fda5SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
30396506fda5SShri Abhyankar       x5    = t[4+idx]; x6 = t[5+idx];
30406506fda5SShri Abhyankar       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
30416506fda5SShri Abhyankar       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
30426506fda5SShri Abhyankar       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
30436506fda5SShri Abhyankar       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
30446506fda5SShri Abhyankar       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
30456506fda5SShri Abhyankar       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
30466506fda5SShri Abhyankar       v += 36;
30476506fda5SShri Abhyankar     }
30486506fda5SShri Abhyankar     idc = 6*c[i];
30496506fda5SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[6]*s2+v[12]*s3+
30506506fda5SShri Abhyankar                                  v[18]*s4+v[24]*s5+v[30]*s6;
30516506fda5SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[7]*s2+v[13]*s3+
30526506fda5SShri Abhyankar                                  v[19]*s4+v[25]*s5+v[31]*s6;
30536506fda5SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[8]*s2+v[14]*s3+
30546506fda5SShri Abhyankar                                  v[20]*s4+v[26]*s5+v[32]*s6;
30556506fda5SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[9]*s2+v[15]*s3+
30566506fda5SShri Abhyankar                                  v[21]*s4+v[27]*s5+v[33]*s6;
30576506fda5SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[10]*s2+v[16]*s3+
30586506fda5SShri Abhyankar                                  v[22]*s4+v[28]*s5+v[34]*s6;
30596506fda5SShri Abhyankar     x[5+idc] = t[5+idt] = v[5]*s1+v[11]*s2+v[17]*s3+
30606506fda5SShri Abhyankar                                  v[23]*s4+v[29]*s5+v[35]*s6;
30616506fda5SShri Abhyankar   }
30626506fda5SShri Abhyankar 
30636506fda5SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
30646506fda5SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
30656506fda5SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30666506fda5SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
30676506fda5SShri Abhyankar   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
30686506fda5SShri Abhyankar   PetscFunctionReturn(0);
30696506fda5SShri Abhyankar }
30708f690400SShri Abhyankar 
30718f690400SShri Abhyankar #undef __FUNCT__
307206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering_inplace"
307306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
307415091d37SBarry Smith {
307515091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3076b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3077dfbe8321SBarry Smith   PetscErrorCode    ierr;
3078b3260449SShri Abhyankar   const PetscInt    *diag = a->diag,*vi,n=a->mbs,*ai=a->i,*aj=a->j;
3079d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3080d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
3081d9fead3dSBarry Smith   const PetscScalar *b;
308215091d37SBarry Smith 
308315091d37SBarry Smith   PetscFunctionBegin;
3084d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
30851ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
308615091d37SBarry Smith   /* forward solve the lower triangular */
308715091d37SBarry Smith   idx    = 0;
308815091d37SBarry Smith   x[0] = b[idx];   x[1] = b[1+idx]; x[2] = b[2+idx];
308915091d37SBarry Smith   x[3] = b[3+idx]; x[4] = b[4+idx]; x[5] = b[5+idx];
309015091d37SBarry Smith   for (i=1; i<n; i++) {
309115091d37SBarry Smith     v     =  aa + 36*ai[i];
309215091d37SBarry Smith     vi    =  aj + ai[i];
309315091d37SBarry Smith     nz    =  diag[i] - ai[i];
309415091d37SBarry Smith     idx   =  6*i;
3095f1af5d2fSBarry Smith     s1  =  b[idx];   s2 = b[1+idx]; s3 = b[2+idx];
3096f1af5d2fSBarry Smith     s4  =  b[3+idx]; s5 = b[4+idx]; s6 = b[5+idx];
309715091d37SBarry Smith     while (nz--) {
309815091d37SBarry Smith       jdx   = 6*(*vi++);
309915091d37SBarry Smith       x1    = x[jdx];   x2 = x[1+jdx]; x3 = x[2+jdx];
310015091d37SBarry Smith       x4    = x[3+jdx]; x5 = x[4+jdx]; x6 = x[5+jdx];
3101f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3102f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3103f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3104f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3105f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3106f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
310715091d37SBarry Smith       v += 36;
310815091d37SBarry Smith      }
3109f1af5d2fSBarry Smith     x[idx]   = s1;
3110f1af5d2fSBarry Smith     x[1+idx] = s2;
3111f1af5d2fSBarry Smith     x[2+idx] = s3;
3112f1af5d2fSBarry Smith     x[3+idx] = s4;
3113f1af5d2fSBarry Smith     x[4+idx] = s5;
3114f1af5d2fSBarry Smith     x[5+idx] = s6;
311515091d37SBarry Smith   }
311615091d37SBarry Smith   /* backward solve the upper triangular */
311715091d37SBarry Smith   for (i=n-1; i>=0; i--){
311815091d37SBarry Smith     v    = aa + 36*diag[i] + 36;
311915091d37SBarry Smith     vi   = aj + diag[i] + 1;
312015091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
312115091d37SBarry Smith     idt  = 6*i;
3122f1af5d2fSBarry Smith     s1 = x[idt];   s2 = x[1+idt];
3123f1af5d2fSBarry Smith     s3 = x[2+idt]; s4 = x[3+idt];
3124f1af5d2fSBarry Smith     s5 = x[4+idt]; s6 = x[5+idt];
312515091d37SBarry Smith     while (nz--) {
312615091d37SBarry Smith       idx   = 6*(*vi++);
312715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
312815091d37SBarry Smith       x4    = x[3+idx]; x5 = x[4+idx]; x6 = x[5+idx];
3129f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[6]*x2  + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
3130f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[7]*x2  + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
3131f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[8]*x2  + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
3132f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[9]*x2  + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
3133f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
3134f1af5d2fSBarry Smith       s6 -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
313515091d37SBarry Smith       v += 36;
313615091d37SBarry Smith     }
313715091d37SBarry Smith     v        = aa + 36*diag[i];
3138f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[6]*s2  + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
3139f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[7]*s2  + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
3140f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[8]*s2  + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
3141f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[9]*s2  + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
3142f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
3143f1af5d2fSBarry Smith     x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
314415091d37SBarry Smith   }
314515091d37SBarry Smith 
3146d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
31471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3148dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*36*(a->nz) - 6.0*A->cmap->n);CHKERRQ(ierr);
314915091d37SBarry Smith   PetscFunctionReturn(0);
315015091d37SBarry Smith }
315115091d37SBarry Smith 
3152cee9d6f2SShri Abhyankar #undef __FUNCT__
31534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_6_NaturalOrdering"
31544dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_6_NaturalOrdering(Mat A,Vec bb,Vec xx)
315553cca76cSShri Abhyankar {
315653cca76cSShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3157b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
315853cca76cSShri Abhyankar     PetscErrorCode    ierr;
3159b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
3160b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
316153cca76cSShri Abhyankar     const MatScalar   *aa=a->a,*v;
316253cca76cSShri Abhyankar     PetscScalar       *x;
316353cca76cSShri Abhyankar     const PetscScalar *b;
316453cca76cSShri Abhyankar     PetscScalar       s1,s2,s3,s4,s5,s6,x1,x2,x3,x4,x5,x6;
316553cca76cSShri Abhyankar 
316653cca76cSShri Abhyankar     PetscFunctionBegin;
316753cca76cSShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
316853cca76cSShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
316953cca76cSShri Abhyankar     /* forward solve the lower triangular */
317053cca76cSShri Abhyankar     idx    = 0;
317153cca76cSShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
317253cca76cSShri Abhyankar     x[4] = b[4+idx];x[5] = b[5+idx];
317353cca76cSShri Abhyankar     for (i=1; i<n; i++) {
317453cca76cSShri Abhyankar        v    = aa + bs2*ai[i];
317553cca76cSShri Abhyankar        vi   = aj + ai[i];
317653cca76cSShri Abhyankar        nz   = ai[i+1] - ai[i];
317753cca76cSShri Abhyankar       idx   = bs*i;
317853cca76cSShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
317953cca76cSShri Abhyankar        s5   = b[4+idx];s6 = b[5+idx];
318053cca76cSShri Abhyankar        for(k=0;k<nz;k++){
318153cca76cSShri Abhyankar           jdx   = bs*vi[k];
318253cca76cSShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
318353cca76cSShri Abhyankar 	  x5    = x[4+jdx]; x6 = x[5+jdx];
318453cca76cSShri Abhyankar           s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
318553cca76cSShri Abhyankar           s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
318653cca76cSShri Abhyankar           s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
318753cca76cSShri Abhyankar 	  s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
318853cca76cSShri Abhyankar           s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
318953cca76cSShri Abhyankar 	  s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
319053cca76cSShri Abhyankar           v   +=  bs2;
319153cca76cSShri Abhyankar         }
319253cca76cSShri Abhyankar 
319353cca76cSShri Abhyankar        x[idx]   = s1;
319453cca76cSShri Abhyankar        x[1+idx] = s2;
319553cca76cSShri Abhyankar        x[2+idx] = s3;
319653cca76cSShri Abhyankar        x[3+idx] = s4;
319753cca76cSShri Abhyankar        x[4+idx] = s5;
319853cca76cSShri Abhyankar        x[5+idx] = s6;
319953cca76cSShri Abhyankar     }
320053cca76cSShri Abhyankar 
320153cca76cSShri Abhyankar    /* backward solve the upper triangular */
320253cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
320353cca76cSShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
320453cca76cSShri Abhyankar      vi  = aj + adiag[i+1]+1;
320553cca76cSShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
320653cca76cSShri Abhyankar      idt = bs*i;
320753cca76cSShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
320853cca76cSShri Abhyankar      s5 = x[4+idt];s6 = x[5+idt];
320953cca76cSShri Abhyankar      for(k=0;k<nz;k++){
321053cca76cSShri Abhyankar       idx   = bs*vi[k];
321153cca76cSShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
321253cca76cSShri Abhyankar        x5    = x[4+idx];x6 = x[5+idx];
321353cca76cSShri Abhyankar        s1   -= v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4  + v[24]*x5 + v[30]*x6;
321453cca76cSShri Abhyankar        s2   -= v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4  + v[25]*x5 + v[31]*x6;;
321553cca76cSShri Abhyankar        s3   -= v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4  + v[26]*x5 + v[32]*x6;
321653cca76cSShri Abhyankar        s4   -= v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4  + v[27]*x5 + v[33]*x6;
321753cca76cSShri Abhyankar        s5   -= v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4  + v[28]*x5 + v[34]*x6;
321853cca76cSShri Abhyankar        s6   -= v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4  + v[29]*x5 + v[35]*x6;
321953cca76cSShri Abhyankar         v   +=  bs2;
322053cca76cSShri Abhyankar     }
322153cca76cSShri Abhyankar     /* x = inv_diagonal*x */
322253cca76cSShri Abhyankar    x[idt]   = v[0]*s1 + v[6]*s2 + v[12]*s3 + v[18]*s4 + v[24]*s5 + v[30]*s6;
322353cca76cSShri Abhyankar    x[1+idt] = v[1]*s1 + v[7]*s2 + v[13]*s3 + v[19]*s4 + v[25]*s5 + v[31]*s6;
322453cca76cSShri Abhyankar    x[2+idt] = v[2]*s1 + v[8]*s2 + v[14]*s3 + v[20]*s4 + v[26]*s5 + v[32]*s6;
322553cca76cSShri Abhyankar    x[3+idt] = v[3]*s1 + v[9]*s2 + v[15]*s3 + v[21]*s4 + v[27]*s5 + v[33]*s6;
322653cca76cSShri Abhyankar    x[4+idt] = v[4]*s1 + v[10]*s2 + v[16]*s3 + v[22]*s4 + v[28]*s5 + v[34]*s6;
322753cca76cSShri Abhyankar    x[5+idt] = v[5]*s1 + v[11]*s2 + v[17]*s3 + v[23]*s4 + v[29]*s5 + v[35]*s6;
322853cca76cSShri Abhyankar   }
322953cca76cSShri Abhyankar 
323053cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
323153cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
323253cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
323353cca76cSShri Abhyankar   PetscFunctionReturn(0);
323453cca76cSShri Abhyankar }
323553cca76cSShri Abhyankar 
323653cca76cSShri Abhyankar #undef __FUNCT__
323706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_inplace"
323806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_inplace(Mat A,Vec bb,Vec xx)
32394e2b4712SSatish Balay {
32404e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
32414e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
32426849ba73SBarry Smith   PetscErrorCode    ierr;
32435d0c19d7SBarry Smith   const PetscInt    *r,*c,*rout,*cout,*diag = a->diag;
3244b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3245b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
3246d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3247d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
3248d9fead3dSBarry Smith   const PetscScalar *b;
32494e2b4712SSatish Balay 
32504e2b4712SSatish Balay   PetscFunctionBegin;
3251d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
32521ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3253f1af5d2fSBarry Smith   t  = a->solve_work;
32544e2b4712SSatish Balay 
32554e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
32564e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
32574e2b4712SSatish Balay 
32584e2b4712SSatish Balay   /* forward solve the lower triangular */
32594e2b4712SSatish Balay   idx    = 5*(*r++);
3260f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3261f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
32624e2b4712SSatish Balay   for (i=1; i<n; i++) {
32634e2b4712SSatish Balay     v     = aa + 25*ai[i];
32644e2b4712SSatish Balay     vi    = aj + ai[i];
32654e2b4712SSatish Balay     nz    = diag[i] - ai[i];
32664e2b4712SSatish Balay     idx   = 5*(*r++);
3267f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
3268f1af5d2fSBarry Smith     s5  = b[4+idx];
32694e2b4712SSatish Balay     while (nz--) {
32704e2b4712SSatish Balay       idx   = 5*(*vi++);
3271f1af5d2fSBarry Smith       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
3272f1af5d2fSBarry Smith       x4    = t[3+idx];x5 = t[4+idx];
3273f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3274f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3275f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3276f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3277f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
32784e2b4712SSatish Balay       v += 25;
32794e2b4712SSatish Balay     }
32804e2b4712SSatish Balay     idx = 5*i;
3281f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3282f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
32834e2b4712SSatish Balay   }
32844e2b4712SSatish Balay   /* backward solve the upper triangular */
32854e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
32864e2b4712SSatish Balay     v    = aa + 25*diag[i] + 25;
32874e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
32884e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
32894e2b4712SSatish Balay     idt  = 5*i;
3290f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3291f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
32924e2b4712SSatish Balay     while (nz--) {
32934e2b4712SSatish Balay       idx   = 5*(*vi++);
3294f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3295f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
3296f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
3297f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
3298f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
3299f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
3300f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
33014e2b4712SSatish Balay       v += 25;
33024e2b4712SSatish Balay     }
33034e2b4712SSatish Balay     idc = 5*(*c--);
33044e2b4712SSatish Balay     v   = aa + 25*diag[i];
3305f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
3306f1af5d2fSBarry Smith                                  v[15]*s4+v[20]*s5;
3307f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
3308f1af5d2fSBarry Smith                                  v[16]*s4+v[21]*s5;
3309f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
3310f1af5d2fSBarry Smith                                  v[17]*s4+v[22]*s5;
3311f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
3312f1af5d2fSBarry Smith                                  v[18]*s4+v[23]*s5;
3313f1af5d2fSBarry Smith     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
3314f1af5d2fSBarry Smith                                  v[19]*s4+v[24]*s5;
33154e2b4712SSatish Balay   }
33164e2b4712SSatish Balay 
33174e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
33184e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3319d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
33201ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3321dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
33224e2b4712SSatish Balay   PetscFunctionReturn(0);
33234e2b4712SSatish Balay }
33244e2b4712SSatish Balay 
332578bb4007SShri Abhyankar #undef __FUNCT__
33264dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5"
33274dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5(Mat A,Vec bb,Vec xx)
332878bb4007SShri Abhyankar {
332978bb4007SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
333078bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
333178bb4007SShri Abhyankar   PetscErrorCode    ierr;
333278bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
3333b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3334b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
333578bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
333678bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5,*t;
333778bb4007SShri Abhyankar   const PetscScalar *b;
333878bb4007SShri Abhyankar 
333978bb4007SShri Abhyankar   PetscFunctionBegin;
334078bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
334178bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
334278bb4007SShri Abhyankar   t  = a->solve_work;
334378bb4007SShri Abhyankar 
334478bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
334578bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
334678bb4007SShri Abhyankar 
334778bb4007SShri Abhyankar   /* forward solve the lower triangular */
334878bb4007SShri Abhyankar   idx    = 5*r[0];
334978bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
335078bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx]; t[4] = b[4+idx];
335178bb4007SShri Abhyankar   for (i=1; i<n; i++) {
335278bb4007SShri Abhyankar     v     = aa + 25*ai[i];
335378bb4007SShri Abhyankar     vi    = aj + ai[i];
335478bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
335578bb4007SShri Abhyankar     idx   = 5*r[i];
335678bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
335778bb4007SShri Abhyankar     s5  = b[4+idx];
335878bb4007SShri Abhyankar     for(m=0;m<nz;m++){
335978bb4007SShri Abhyankar       idx   = 5*vi[m];
336078bb4007SShri Abhyankar       x1    = t[idx];  x2 = t[1+idx];x3 = t[2+idx];
336178bb4007SShri Abhyankar       x4    = t[3+idx];x5 = t[4+idx];
336278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
336378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
336478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
336578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
336678bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
336778bb4007SShri Abhyankar       v += 25;
336878bb4007SShri Abhyankar     }
336978bb4007SShri Abhyankar     idx = 5*i;
337078bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
337178bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4; t[4+idx] = s5;
337278bb4007SShri Abhyankar   }
337378bb4007SShri Abhyankar   /* backward solve the upper triangular */
337478bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
337578bb4007SShri Abhyankar     v    = aa + 25*(adiag[i+1]+1);
337678bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
337778bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
337878bb4007SShri Abhyankar     idt  = 5*i;
337978bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
338078bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt]; s5 = t[4+idt];
338178bb4007SShri Abhyankar     for(m=0;m<nz;m++){
338278bb4007SShri Abhyankar       idx   = 5*vi[m];
338378bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
338478bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx]; x5 = t[4+idx];
338578bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
338678bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
338778bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
338878bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
338978bb4007SShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
339078bb4007SShri Abhyankar       v += 25;
339178bb4007SShri Abhyankar     }
339278bb4007SShri Abhyankar     idc = 5*c[i];
339378bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[5]*s2+v[10]*s3+
339478bb4007SShri Abhyankar                                  v[15]*s4+v[20]*s5;
339578bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[6]*s2+v[11]*s3+
339678bb4007SShri Abhyankar                                  v[16]*s4+v[21]*s5;
339778bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[7]*s2+v[12]*s3+
339878bb4007SShri Abhyankar                                  v[17]*s4+v[22]*s5;
339978bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[8]*s2+v[13]*s3+
340078bb4007SShri Abhyankar                                  v[18]*s4+v[23]*s5;
340178bb4007SShri Abhyankar     x[4+idc] = t[4+idt] = v[4]*s1+v[9]*s2+v[14]*s3+
340278bb4007SShri Abhyankar                                  v[19]*s4+v[24]*s5;
340378bb4007SShri Abhyankar   }
340478bb4007SShri Abhyankar 
340578bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
340678bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
340778bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
340878bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
340978bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
341078bb4007SShri Abhyankar   PetscFunctionReturn(0);
341178bb4007SShri Abhyankar }
341278bb4007SShri Abhyankar 
34138f690400SShri Abhyankar #undef __FUNCT__
341406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering_inplace"
341506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
341615091d37SBarry Smith {
341715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3418b3260449SShri Abhyankar   const PetscInt    *diag=a->diag,n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3419b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,jdx;
3420dfbe8321SBarry Smith   PetscErrorCode    ierr;
3421d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3422d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
3423d9fead3dSBarry Smith   const PetscScalar *b;
342415091d37SBarry Smith 
342515091d37SBarry Smith   PetscFunctionBegin;
3426d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34271ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
342815091d37SBarry Smith   /* forward solve the lower triangular */
342915091d37SBarry Smith   idx    = 0;
343015091d37SBarry Smith   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
343115091d37SBarry Smith   for (i=1; i<n; i++) {
343215091d37SBarry Smith     v     =  aa + 25*ai[i];
343315091d37SBarry Smith     vi    =  aj + ai[i];
343415091d37SBarry Smith     nz    =  diag[i] - ai[i];
343515091d37SBarry Smith     idx   =  5*i;
3436f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
343715091d37SBarry Smith     while (nz--) {
343815091d37SBarry Smith       jdx   = 5*(*vi++);
343915091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
3440f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3441f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3442f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3443f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3444f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
344515091d37SBarry Smith       v    += 25;
344615091d37SBarry Smith     }
3447f1af5d2fSBarry Smith     x[idx]   = s1;
3448f1af5d2fSBarry Smith     x[1+idx] = s2;
3449f1af5d2fSBarry Smith     x[2+idx] = s3;
3450f1af5d2fSBarry Smith     x[3+idx] = s4;
3451f1af5d2fSBarry Smith     x[4+idx] = s5;
345215091d37SBarry Smith   }
345315091d37SBarry Smith   /* backward solve the upper triangular */
345415091d37SBarry Smith   for (i=n-1; i>=0; i--){
345515091d37SBarry Smith     v    = aa + 25*diag[i] + 25;
345615091d37SBarry Smith     vi   = aj + diag[i] + 1;
345715091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
345815091d37SBarry Smith     idt  = 5*i;
3459f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
3460f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
346115091d37SBarry Smith     while (nz--) {
346215091d37SBarry Smith       idx   = 5*(*vi++);
346315091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
3464f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
3465f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
3466f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
3467f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
3468f1af5d2fSBarry Smith       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
346915091d37SBarry Smith       v    += 25;
347015091d37SBarry Smith     }
347115091d37SBarry Smith     v        = aa + 25*diag[i];
3472f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
3473f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
3474f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
3475f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
3476f1af5d2fSBarry Smith     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
347715091d37SBarry Smith   }
347815091d37SBarry Smith 
3479d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
34801ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3481dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
348215091d37SBarry Smith   PetscFunctionReturn(0);
348315091d37SBarry Smith }
348415091d37SBarry Smith 
3485cee9d6f2SShri Abhyankar #undef __FUNCT__
34864dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_5_NaturalOrdering"
34874dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_5_NaturalOrdering(Mat A,Vec bb,Vec xx)
348853cca76cSShri Abhyankar {
348953cca76cSShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3490b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3491b3260449SShri Abhyankar   PetscInt          i,k,nz,idx,idt,jdx;
349253cca76cSShri Abhyankar   PetscErrorCode    ierr;
349353cca76cSShri Abhyankar   const MatScalar   *aa=a->a,*v;
349453cca76cSShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,s5,x1,x2,x3,x4,x5;
349553cca76cSShri Abhyankar   const PetscScalar *b;
349653cca76cSShri Abhyankar 
349753cca76cSShri Abhyankar   PetscFunctionBegin;
349853cca76cSShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
349953cca76cSShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
350053cca76cSShri Abhyankar   /* forward solve the lower triangular */
350153cca76cSShri Abhyankar   idx    = 0;
350253cca76cSShri Abhyankar   x[0] = b[idx]; x[1] = b[1+idx]; x[2] = b[2+idx]; x[3] = b[3+idx];x[4] = b[4+idx];
350353cca76cSShri Abhyankar   for (i=1; i<n; i++) {
350453cca76cSShri Abhyankar     v   = aa + 25*ai[i];
350553cca76cSShri Abhyankar     vi  = aj + ai[i];
350653cca76cSShri Abhyankar     nz  = ai[i+1] - ai[i];
350753cca76cSShri Abhyankar     idx = 5*i;
350853cca76cSShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];s5 = b[4+idx];
350953cca76cSShri Abhyankar     for(k=0;k<nz;k++) {
351053cca76cSShri Abhyankar       jdx   = 5*vi[k];
351153cca76cSShri Abhyankar       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];x5 = x[4+jdx];
351253cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
351353cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
351453cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
351553cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
351653cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
351753cca76cSShri Abhyankar       v    += 25;
351853cca76cSShri Abhyankar     }
351953cca76cSShri Abhyankar     x[idx]   = s1;
352053cca76cSShri Abhyankar     x[1+idx] = s2;
352153cca76cSShri Abhyankar     x[2+idx] = s3;
352253cca76cSShri Abhyankar     x[3+idx] = s4;
352353cca76cSShri Abhyankar     x[4+idx] = s5;
352453cca76cSShri Abhyankar   }
352553cca76cSShri Abhyankar 
352653cca76cSShri Abhyankar   /* backward solve the upper triangular */
352753cca76cSShri Abhyankar   for (i=n-1; i>=0; i--){
352853cca76cSShri Abhyankar     v   = aa + 25*(adiag[i+1]+1);
352953cca76cSShri Abhyankar     vi  = aj + adiag[i+1]+1;
353053cca76cSShri Abhyankar     nz  = adiag[i] - adiag[i+1]-1;
353153cca76cSShri Abhyankar     idt = 5*i;
353253cca76cSShri Abhyankar     s1 = x[idt];  s2 = x[1+idt];
353353cca76cSShri Abhyankar     s3 = x[2+idt];s4 = x[3+idt]; s5 = x[4+idt];
353453cca76cSShri Abhyankar     for(k=0;k<nz;k++){
353553cca76cSShri Abhyankar       idx   = 5*vi[k];
353653cca76cSShri Abhyankar       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx]; x5 = x[4+idx];
353753cca76cSShri Abhyankar       s1 -= v[0]*x1 + v[5]*x2 + v[10]*x3  + v[15]*x4 + v[20]*x5;
353853cca76cSShri Abhyankar       s2 -= v[1]*x1 + v[6]*x2 + v[11]*x3  + v[16]*x4 + v[21]*x5;
353953cca76cSShri Abhyankar       s3 -= v[2]*x1 + v[7]*x2 + v[12]*x3  + v[17]*x4 + v[22]*x5;
354053cca76cSShri Abhyankar       s4 -= v[3]*x1 + v[8]*x2 + v[13]*x3  + v[18]*x4 + v[23]*x5;
354153cca76cSShri Abhyankar       s5 -= v[4]*x1 + v[9]*x2 + v[14]*x3  + v[19]*x4 + v[24]*x5;
354253cca76cSShri Abhyankar       v    += 25;
354353cca76cSShri Abhyankar     }
354453cca76cSShri Abhyankar     /* x = inv_diagonal*x */
354553cca76cSShri Abhyankar     x[idt]   = v[0]*s1 + v[5]*s2 + v[10]*s3  + v[15]*s4 + v[20]*s5;
354653cca76cSShri Abhyankar     x[1+idt] = v[1]*s1 + v[6]*s2 + v[11]*s3  + v[16]*s4 + v[21]*s5;
354753cca76cSShri Abhyankar     x[2+idt] = v[2]*s1 + v[7]*s2 + v[12]*s3  + v[17]*s4 + v[22]*s5;
354853cca76cSShri Abhyankar     x[3+idt] = v[3]*s1 + v[8]*s2 + v[13]*s3  + v[18]*s4 + v[23]*s5;
354953cca76cSShri Abhyankar     x[4+idt] = v[4]*s1 + v[9]*s2 + v[14]*s3  + v[19]*s4 + v[24]*s5;
355053cca76cSShri Abhyankar   }
355153cca76cSShri Abhyankar 
355253cca76cSShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
355353cca76cSShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
355453cca76cSShri Abhyankar   ierr = PetscLogFlops(2.0*25*(a->nz) - 5.0*A->cmap->n);CHKERRQ(ierr);
355553cca76cSShri Abhyankar   PetscFunctionReturn(0);
355653cca76cSShri Abhyankar }
355753cca76cSShri Abhyankar 
355853cca76cSShri Abhyankar #undef __FUNCT__
355906e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_inplace"
356006e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_inplace(Mat A,Vec bb,Vec xx)
35614e2b4712SSatish Balay {
35624e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
35634e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
35646849ba73SBarry Smith   PetscErrorCode    ierr;
3565b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3566b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
35675d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3568d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3569d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
3570d9fead3dSBarry Smith   const PetscScalar *b;
35714e2b4712SSatish Balay 
35724e2b4712SSatish Balay   PetscFunctionBegin;
3573d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
35741ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3575f1af5d2fSBarry Smith   t  = a->solve_work;
35764e2b4712SSatish Balay 
35774e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
35784e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
35794e2b4712SSatish Balay 
35804e2b4712SSatish Balay   /* forward solve the lower triangular */
35814e2b4712SSatish Balay   idx    = 4*(*r++);
3582f1af5d2fSBarry Smith   t[0] = b[idx];   t[1] = b[1+idx];
3583f1af5d2fSBarry Smith   t[2] = b[2+idx]; t[3] = b[3+idx];
35844e2b4712SSatish Balay   for (i=1; i<n; i++) {
35854e2b4712SSatish Balay     v     = aa + 16*ai[i];
35864e2b4712SSatish Balay     vi    = aj + ai[i];
35874e2b4712SSatish Balay     nz    = diag[i] - ai[i];
35884e2b4712SSatish Balay     idx   = 4*(*r++);
3589f1af5d2fSBarry Smith     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
35904e2b4712SSatish Balay     while (nz--) {
35914e2b4712SSatish Balay       idx   = 4*(*vi++);
3592f1af5d2fSBarry Smith       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
3593f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3594f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3595f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3596f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
35974e2b4712SSatish Balay       v    += 16;
35984e2b4712SSatish Balay     }
35994e2b4712SSatish Balay     idx        = 4*i;
3600f1af5d2fSBarry Smith     t[idx]   = s1;t[1+idx] = s2;
3601f1af5d2fSBarry Smith     t[2+idx] = s3;t[3+idx] = s4;
36024e2b4712SSatish Balay   }
36034e2b4712SSatish Balay   /* backward solve the upper triangular */
36044e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
36054e2b4712SSatish Balay     v    = aa + 16*diag[i] + 16;
36064e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
36074e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
36084e2b4712SSatish Balay     idt  = 4*i;
3609f1af5d2fSBarry Smith     s1 = t[idt];  s2 = t[1+idt];
3610f1af5d2fSBarry Smith     s3 = t[2+idt];s4 = t[3+idt];
36114e2b4712SSatish Balay     while (nz--) {
36124e2b4712SSatish Balay       idx   = 4*(*vi++);
3613f1af5d2fSBarry Smith       x1    = t[idx];   x2 = t[1+idx];
3614f1af5d2fSBarry Smith       x3    = t[2+idx]; x4 = t[3+idx];
3615f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3616f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3617f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3618f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
36194e2b4712SSatish Balay       v += 16;
36204e2b4712SSatish Balay     }
36214e2b4712SSatish Balay     idc      = 4*(*c--);
36224e2b4712SSatish Balay     v        = aa + 16*diag[i];
3623f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3624f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3625f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3626f1af5d2fSBarry Smith     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
36274e2b4712SSatish Balay   }
36284e2b4712SSatish Balay 
36294e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
36304e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3631d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
36321ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3633dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
36344e2b4712SSatish Balay   PetscFunctionReturn(0);
36354e2b4712SSatish Balay }
3636f26ec98cSKris Buschelman 
36378f690400SShri Abhyankar #undef __FUNCT__
36384dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4"
36394dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4(Mat A,Vec bb,Vec xx)
364078bb4007SShri Abhyankar {
364178bb4007SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
364278bb4007SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
364378bb4007SShri Abhyankar   PetscErrorCode    ierr;
3644b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
3645b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
364678bb4007SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
364778bb4007SShri Abhyankar   const MatScalar   *aa=a->a,*v;
364878bb4007SShri Abhyankar   PetscScalar       *x,s1,s2,s3,s4,x1,x2,x3,x4,*t;
364978bb4007SShri Abhyankar   const PetscScalar *b;
365078bb4007SShri Abhyankar 
365178bb4007SShri Abhyankar   PetscFunctionBegin;
365278bb4007SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
365378bb4007SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
365478bb4007SShri Abhyankar   t  = a->solve_work;
365578bb4007SShri Abhyankar 
365678bb4007SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
365778bb4007SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
365878bb4007SShri Abhyankar 
365978bb4007SShri Abhyankar   /* forward solve the lower triangular */
366078bb4007SShri Abhyankar   idx    = 4*r[0];
366178bb4007SShri Abhyankar   t[0] = b[idx];   t[1] = b[1+idx];
366278bb4007SShri Abhyankar   t[2] = b[2+idx]; t[3] = b[3+idx];
366378bb4007SShri Abhyankar   for (i=1; i<n; i++) {
366478bb4007SShri Abhyankar     v     = aa + 16*ai[i];
366578bb4007SShri Abhyankar     vi    = aj + ai[i];
366678bb4007SShri Abhyankar     nz    = ai[i+1] - ai[i];
366778bb4007SShri Abhyankar     idx   = 4*r[i];
366878bb4007SShri Abhyankar     s1  = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
366978bb4007SShri Abhyankar     for(m=0;m<nz;m++){
367078bb4007SShri Abhyankar       idx   = 4*vi[m];
367178bb4007SShri Abhyankar       x1    = t[idx];x2 = t[1+idx];x3 = t[2+idx];x4 = t[3+idx];
367278bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
367378bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
367478bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
367578bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
367678bb4007SShri Abhyankar       v    += 16;
367778bb4007SShri Abhyankar     }
367878bb4007SShri Abhyankar     idx        = 4*i;
367978bb4007SShri Abhyankar     t[idx]   = s1;t[1+idx] = s2;
368078bb4007SShri Abhyankar     t[2+idx] = s3;t[3+idx] = s4;
368178bb4007SShri Abhyankar   }
368278bb4007SShri Abhyankar   /* backward solve the upper triangular */
368378bb4007SShri Abhyankar   for (i=n-1; i>=0; i--){
368478bb4007SShri Abhyankar     v    = aa + 16*(adiag[i+1]+1);
368578bb4007SShri Abhyankar     vi   = aj + adiag[i+1]+1;
368678bb4007SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
368778bb4007SShri Abhyankar     idt  = 4*i;
368878bb4007SShri Abhyankar     s1 = t[idt];  s2 = t[1+idt];
368978bb4007SShri Abhyankar     s3 = t[2+idt];s4 = t[3+idt];
369078bb4007SShri Abhyankar     for(m=0;m<nz;m++){
369178bb4007SShri Abhyankar       idx   = 4*vi[m];
369278bb4007SShri Abhyankar       x1    = t[idx];   x2 = t[1+idx];
369378bb4007SShri Abhyankar       x3    = t[2+idx]; x4 = t[3+idx];
369478bb4007SShri Abhyankar       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
369578bb4007SShri Abhyankar       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
369678bb4007SShri Abhyankar       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
369778bb4007SShri Abhyankar       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
369878bb4007SShri Abhyankar       v += 16;
369978bb4007SShri Abhyankar     }
370078bb4007SShri Abhyankar     idc      = 4*c[i];
370178bb4007SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
370278bb4007SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
370378bb4007SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
370478bb4007SShri Abhyankar     x[3+idc] = t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
370578bb4007SShri Abhyankar   }
370678bb4007SShri Abhyankar 
370778bb4007SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
370878bb4007SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
370978bb4007SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
371078bb4007SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
371178bb4007SShri Abhyankar   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
371278bb4007SShri Abhyankar   PetscFunctionReturn(0);
371378bb4007SShri Abhyankar }
371478bb4007SShri Abhyankar 
371578bb4007SShri Abhyankar #undef __FUNCT__
3716f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_Demotion"
3717dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_Demotion(Mat A,Vec bb,Vec xx)
3718f26ec98cSKris Buschelman {
3719f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
3720f26ec98cSKris Buschelman   IS                iscol=a->col,isrow=a->row;
37216849ba73SBarry Smith   PetscErrorCode    ierr;
3722b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
3723b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
37245d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
3725d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
3726d9fead3dSBarry Smith   MatScalar         s1,s2,s3,s4,x1,x2,x3,x4,*t;
3727d9fead3dSBarry Smith   PetscScalar       *x;
3728d9fead3dSBarry Smith   const PetscScalar *b;
3729f26ec98cSKris Buschelman 
3730f26ec98cSKris Buschelman   PetscFunctionBegin;
3731d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
37321ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
3733f26ec98cSKris Buschelman   t  = (MatScalar *)a->solve_work;
3734f26ec98cSKris Buschelman 
3735f26ec98cSKris Buschelman   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
3736f26ec98cSKris Buschelman   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
3737f26ec98cSKris Buschelman 
3738f26ec98cSKris Buschelman   /* forward solve the lower triangular */
3739f26ec98cSKris Buschelman   idx    = 4*(*r++);
3740f26ec98cSKris Buschelman   t[0] = (MatScalar)b[idx];
3741f26ec98cSKris Buschelman   t[1] = (MatScalar)b[1+idx];
3742f26ec98cSKris Buschelman   t[2] = (MatScalar)b[2+idx];
3743f26ec98cSKris Buschelman   t[3] = (MatScalar)b[3+idx];
3744f26ec98cSKris Buschelman   for (i=1; i<n; i++) {
3745f26ec98cSKris Buschelman     v     = aa + 16*ai[i];
3746f26ec98cSKris Buschelman     vi    = aj + ai[i];
3747f26ec98cSKris Buschelman     nz    = diag[i] - ai[i];
3748f26ec98cSKris Buschelman     idx   = 4*(*r++);
3749f26ec98cSKris Buschelman     s1 = (MatScalar)b[idx];
3750f26ec98cSKris Buschelman     s2 = (MatScalar)b[1+idx];
3751f26ec98cSKris Buschelman     s3 = (MatScalar)b[2+idx];
3752f26ec98cSKris Buschelman     s4 = (MatScalar)b[3+idx];
3753f26ec98cSKris Buschelman     while (nz--) {
3754f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3755f26ec98cSKris Buschelman       x1  = t[idx];
3756f26ec98cSKris Buschelman       x2  = t[1+idx];
3757f26ec98cSKris Buschelman       x3  = t[2+idx];
3758f26ec98cSKris Buschelman       x4  = t[3+idx];
3759f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
3760f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
3761f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
3762f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
3763f26ec98cSKris Buschelman       v    += 16;
3764f26ec98cSKris Buschelman     }
3765f26ec98cSKris Buschelman     idx        = 4*i;
3766f26ec98cSKris Buschelman     t[idx]   = s1;
3767f26ec98cSKris Buschelman     t[1+idx] = s2;
3768f26ec98cSKris Buschelman     t[2+idx] = s3;
3769f26ec98cSKris Buschelman     t[3+idx] = s4;
3770f26ec98cSKris Buschelman   }
3771f26ec98cSKris Buschelman   /* backward solve the upper triangular */
3772f26ec98cSKris Buschelman   for (i=n-1; i>=0; i--){
3773f26ec98cSKris Buschelman     v    = aa + 16*diag[i] + 16;
3774f26ec98cSKris Buschelman     vi   = aj + diag[i] + 1;
3775f26ec98cSKris Buschelman     nz   = ai[i+1] - diag[i] - 1;
3776f26ec98cSKris Buschelman     idt  = 4*i;
3777f26ec98cSKris Buschelman     s1 = t[idt];
3778f26ec98cSKris Buschelman     s2 = t[1+idt];
3779f26ec98cSKris Buschelman     s3 = t[2+idt];
3780f26ec98cSKris Buschelman     s4 = t[3+idt];
3781f26ec98cSKris Buschelman     while (nz--) {
3782f26ec98cSKris Buschelman       idx   = 4*(*vi++);
3783f26ec98cSKris Buschelman       x1  = t[idx];
3784f26ec98cSKris Buschelman       x2  = t[1+idx];
3785f26ec98cSKris Buschelman       x3  = t[2+idx];
3786f26ec98cSKris Buschelman       x4  = t[3+idx];
3787f26ec98cSKris Buschelman       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
3788f26ec98cSKris Buschelman       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
3789f26ec98cSKris Buschelman       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
3790f26ec98cSKris Buschelman       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
3791f26ec98cSKris Buschelman       v += 16;
3792f26ec98cSKris Buschelman     }
3793f26ec98cSKris Buschelman     idc      = 4*(*c--);
3794f26ec98cSKris Buschelman     v        = aa + 16*diag[i];
3795f26ec98cSKris Buschelman     t[idt]   = v[0]*s1+v[4]*s2+v[8]*s3+v[12]*s4;
3796f26ec98cSKris Buschelman     t[1+idt] = v[1]*s1+v[5]*s2+v[9]*s3+v[13]*s4;
3797f26ec98cSKris Buschelman     t[2+idt] = v[2]*s1+v[6]*s2+v[10]*s3+v[14]*s4;
3798f26ec98cSKris Buschelman     t[3+idt] = v[3]*s1+v[7]*s2+v[11]*s3+v[15]*s4;
3799f26ec98cSKris Buschelman     x[idc]   = (PetscScalar)t[idt];
3800f26ec98cSKris Buschelman     x[1+idc] = (PetscScalar)t[1+idt];
3801f26ec98cSKris Buschelman     x[2+idc] = (PetscScalar)t[2+idt];
3802f26ec98cSKris Buschelman     x[3+idc] = (PetscScalar)t[3+idt];
3803f26ec98cSKris Buschelman  }
3804f26ec98cSKris Buschelman 
3805f26ec98cSKris Buschelman   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
3806f26ec98cSKris Buschelman   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
3807d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
38081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
3809dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
3810f26ec98cSKris Buschelman   PetscFunctionReturn(0);
3811f26ec98cSKris Buschelman }
3812f26ec98cSKris Buschelman 
381324c233c2SKris Buschelman #if defined (PETSC_HAVE_SSE)
381424c233c2SKris Buschelman 
381524c233c2SKris Buschelman #include PETSC_HAVE_SSE
381624c233c2SKris Buschelman 
381724c233c2SKris Buschelman #undef __FUNCT__
381824c233c2SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_SSE_Demotion"
3819dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_SSE_Demotion(Mat A,Vec bb,Vec xx)
382024c233c2SKris Buschelman {
382124c233c2SKris Buschelman   /*
382224c233c2SKris Buschelman      Note: This code uses demotion of double
382324c233c2SKris Buschelman      to float when performing the mixed-mode computation.
382424c233c2SKris Buschelman      This may not be numerically reasonable for all applications.
382524c233c2SKris Buschelman   */
382624c233c2SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
382724c233c2SKris Buschelman   IS             iscol=a->col,isrow=a->row;
38286849ba73SBarry Smith   PetscErrorCode ierr;
38295d0c19d7SBarry Smith   PetscInt       i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,nz,idx,idt,idc,ai16;
38305d0c19d7SBarry Smith   const PetscInt *r,*c,*diag = a->diag,*rout,*cout;
383124c233c2SKris Buschelman   MatScalar      *aa=a->a,*v;
383287828ca2SBarry Smith   PetscScalar    *x,*b,*t;
383324c233c2SKris Buschelman 
383424c233c2SKris Buschelman   /* Make space in temp stack for 16 Byte Aligned arrays */
383524c233c2SKris Buschelman   float           ssealignedspace[11],*tmps,*tmpx;
383624c233c2SKris Buschelman   unsigned long   offset;
383724c233c2SKris Buschelman 
383824c233c2SKris Buschelman   PetscFunctionBegin;
383924c233c2SKris Buschelman   SSE_SCOPE_BEGIN;
384024c233c2SKris Buschelman 
384124c233c2SKris Buschelman     offset = (unsigned long)ssealignedspace % 16;
384224c233c2SKris Buschelman     if (offset) offset = (16 - offset)/4;
384324c233c2SKris Buschelman     tmps = &ssealignedspace[offset];
384424c233c2SKris Buschelman     tmpx = &ssealignedspace[offset+4];
384524c233c2SKris Buschelman     PREFETCH_NTA(aa+16*ai[1]);
384624c233c2SKris Buschelman 
38471ebc52fbSHong Zhang     ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
38481ebc52fbSHong Zhang     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
384924c233c2SKris Buschelman     t  = a->solve_work;
385024c233c2SKris Buschelman 
385124c233c2SKris Buschelman     ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
385224c233c2SKris Buschelman     ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
385324c233c2SKris Buschelman 
385424c233c2SKris Buschelman     /* forward solve the lower triangular */
385524c233c2SKris Buschelman     idx  = 4*(*r++);
385624c233c2SKris Buschelman     t[0] = b[idx];   t[1] = b[1+idx];
385724c233c2SKris Buschelman     t[2] = b[2+idx]; t[3] = b[3+idx];
385824c233c2SKris Buschelman     v    =  aa + 16*ai[1];
385924c233c2SKris Buschelman 
386024c233c2SKris Buschelman     for (i=1; i<n;) {
386124c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
386224c233c2SKris Buschelman       vi   =  aj      + ai[i];
386324c233c2SKris Buschelman       nz   =  diag[i] - ai[i];
386424c233c2SKris Buschelman       idx  =  4*(*r++);
386524c233c2SKris Buschelman 
386624c233c2SKris Buschelman       /* Demote sum from double to float */
386724c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&b[idx]);
386824c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
386924c233c2SKris Buschelman 
387024c233c2SKris Buschelman       while (nz--) {
387124c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
387224c233c2SKris Buschelman         idx = 4*(*vi++);
387324c233c2SKris Buschelman 
387424c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
387524c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&x[idx]);
387624c233c2SKris Buschelman 
387724c233c2SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
387824c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
387924c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
388024c233c2SKris Buschelman 
388124c233c2SKris Buschelman           /* First Column */
388224c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
388324c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
388424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
388524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
388624c233c2SKris Buschelman 
388724c233c2SKris Buschelman           /* Second Column */
388824c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
388924c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
389024c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
389124c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
389224c233c2SKris Buschelman 
389324c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
389424c233c2SKris Buschelman 
389524c233c2SKris Buschelman           /* Third Column */
389624c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
389724c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
389824c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
389924c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
390024c233c2SKris Buschelman 
390124c233c2SKris Buschelman           /* Fourth Column */
390224c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
390324c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
390424c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
390524c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
390624c233c2SKris Buschelman         SSE_INLINE_END_2
390724c233c2SKris Buschelman 
390824c233c2SKris Buschelman         v  += 16;
390924c233c2SKris Buschelman       }
391024c233c2SKris Buschelman       idx = 4*i;
391124c233c2SKris Buschelman       v   = aa + 16*ai[++i];
391224c233c2SKris Buschelman       PREFETCH_NTA(v);
391324c233c2SKris Buschelman       STORE_PS(tmps,XMM7);
391424c233c2SKris Buschelman 
391524c233c2SKris Buschelman       /* Promote result from float to double */
391624c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idx],tmps);
391724c233c2SKris Buschelman     }
391824c233c2SKris Buschelman     /* backward solve the upper triangular */
391924c233c2SKris Buschelman     idt  = 4*(n-1);
392024c233c2SKris Buschelman     ai16 = 16*diag[n-1];
392124c233c2SKris Buschelman     v    = aa + ai16 + 16;
392224c233c2SKris Buschelman     for (i=n-1; i>=0;){
392324c233c2SKris Buschelman       PREFETCH_NTA(&v[8]);
392424c233c2SKris Buschelman       vi = aj + diag[i] + 1;
392524c233c2SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
392624c233c2SKris Buschelman 
392724c233c2SKris Buschelman       /* Demote accumulator from double to float */
392824c233c2SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(tmps,&t[idt]);
392924c233c2SKris Buschelman       LOAD_PS(tmps,XMM7);
393024c233c2SKris Buschelman 
393124c233c2SKris Buschelman       while (nz--) {
393224c233c2SKris Buschelman         PREFETCH_NTA(&v[16]);
393324c233c2SKris Buschelman         idx = 4*(*vi++);
393424c233c2SKris Buschelman 
393524c233c2SKris Buschelman         /* Demote solution (so far) from double to float */
393624c233c2SKris Buschelman         CONVERT_DOUBLE4_FLOAT4(tmpx,&t[idx]);
393724c233c2SKris Buschelman 
393824c233c2SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
393924c233c2SKris Buschelman         SSE_INLINE_BEGIN_2(tmpx,v)
394024c233c2SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
394124c233c2SKris Buschelman 
394224c233c2SKris Buschelman           /* First Column */
394324c233c2SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
394424c233c2SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
394524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
394624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
394724c233c2SKris Buschelman 
394824c233c2SKris Buschelman           /* Second Column */
394924c233c2SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
395024c233c2SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
395124c233c2SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
395224c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
395324c233c2SKris Buschelman 
395424c233c2SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
395524c233c2SKris Buschelman 
395624c233c2SKris Buschelman           /* Third Column */
395724c233c2SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
395824c233c2SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
395924c233c2SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
396024c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
396124c233c2SKris Buschelman 
396224c233c2SKris Buschelman           /* Fourth Column */
396324c233c2SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
396424c233c2SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
396524c233c2SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
396624c233c2SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
396724c233c2SKris Buschelman         SSE_INLINE_END_2
396824c233c2SKris Buschelman         v  += 16;
396924c233c2SKris Buschelman       }
397024c233c2SKris Buschelman       v    = aa + ai16;
397124c233c2SKris Buschelman       ai16 = 16*diag[--i];
397224c233c2SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
397324c233c2SKris Buschelman       /*
397424c233c2SKris Buschelman          Scale the result by the diagonal 4x4 block,
397524c233c2SKris Buschelman          which was inverted as part of the factorization
397624c233c2SKris Buschelman       */
397724c233c2SKris Buschelman       SSE_INLINE_BEGIN_3(v,tmps,aa+ai16)
397824c233c2SKris Buschelman         /* First Column */
397924c233c2SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
398024c233c2SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
398124c233c2SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
398224c233c2SKris Buschelman 
398324c233c2SKris Buschelman         /* Second Column */
398424c233c2SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
398524c233c2SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
398624c233c2SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
398724c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
398824c233c2SKris Buschelman 
398924c233c2SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
399024c233c2SKris Buschelman 
399124c233c2SKris Buschelman         /* Third Column */
399224c233c2SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
399324c233c2SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
399424c233c2SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
399524c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
399624c233c2SKris Buschelman 
399724c233c2SKris Buschelman         /* Fourth Column */
399824c233c2SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
399924c233c2SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
400024c233c2SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
400124c233c2SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
400224c233c2SKris Buschelman 
400324c233c2SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
400424c233c2SKris Buschelman       SSE_INLINE_END_3
400524c233c2SKris Buschelman 
400624c233c2SKris Buschelman       /* Promote solution from float to double */
400724c233c2SKris Buschelman       CONVERT_FLOAT4_DOUBLE4(&t[idt],tmps);
400824c233c2SKris Buschelman 
400924c233c2SKris Buschelman       /* Apply reordering to t and stream into x.    */
401024c233c2SKris Buschelman       /* This way, x doesn't pollute the cache.      */
401124c233c2SKris Buschelman       /* Be careful with size: 2 doubles = 4 floats! */
401224c233c2SKris Buschelman       idc  = 4*(*c--);
401324c233c2SKris Buschelman       SSE_INLINE_BEGIN_2((float *)&t[idt],(float *)&x[idc])
401424c233c2SKris Buschelman         /*  x[idc]   = t[idt];   x[1+idc] = t[1+idc]; */
401524c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM0)
401624c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_0,XMM0)
401724c233c2SKris Buschelman         /*  x[idc+2] = t[idt+2]; x[3+idc] = t[3+idc]; */
401824c233c2SKris Buschelman         SSE_LOAD_PS(SSE_ARG_1,FLOAT_4,XMM1)
401924c233c2SKris Buschelman         SSE_STREAM_PS(SSE_ARG_2,FLOAT_4,XMM1)
402024c233c2SKris Buschelman       SSE_INLINE_END_2
402124c233c2SKris Buschelman       v    = aa + ai16 + 16;
402224c233c2SKris Buschelman       idt -= 4;
402324c233c2SKris Buschelman     }
402424c233c2SKris Buschelman 
402524c233c2SKris Buschelman     ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
402624c233c2SKris Buschelman     ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
40271ebc52fbSHong Zhang     ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
40281ebc52fbSHong Zhang     ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4029dc0b31edSSatish Balay     ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
403024c233c2SKris Buschelman   SSE_SCOPE_END;
403124c233c2SKris Buschelman   PetscFunctionReturn(0);
403224c233c2SKris Buschelman }
403324c233c2SKris Buschelman 
403424c233c2SKris Buschelman #endif
40350ef38995SBarry Smith 
40360ef38995SBarry Smith 
40374e2b4712SSatish Balay /*
40384e2b4712SSatish Balay       Special case where the matrix was ILU(0) factored in the natural
40394e2b4712SSatish Balay    ordering. This eliminates the need for the column and row permutation.
40404e2b4712SSatish Balay */
40414a2ae208SSatish Balay #undef __FUNCT__
404206e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_inplace"
404306e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
40444e2b4712SSatish Balay {
40454e2b4712SSatish Balay   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4046356650c2SBarry Smith   PetscInt          n=a->mbs;
4047356650c2SBarry Smith   const PetscInt    *ai=a->i,*aj=a->j;
4048dfbe8321SBarry Smith   PetscErrorCode    ierr;
4049356650c2SBarry Smith   const PetscInt    *diag = a->diag;
4050d9fead3dSBarry Smith   const MatScalar   *aa=a->a;
4051d9fead3dSBarry Smith   PetscScalar       *x;
4052d9fead3dSBarry Smith   const PetscScalar *b;
40534e2b4712SSatish Balay 
40544e2b4712SSatish Balay   PetscFunctionBegin;
4055d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
40561ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
40574e2b4712SSatish Balay 
4058aa482453SBarry Smith #if defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJBLAS)
40592853dc0eSBarry Smith   {
406087828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
40612853dc0eSBarry Smith     fortransolvebaij4blas_(&n,x,ai,aj,diag,aa,b,w);
40622853dc0eSBarry Smith   }
4063aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJ)
40642853dc0eSBarry Smith   {
406587828ca2SBarry Smith     static PetscScalar w[2000]; /* very BAD need to fix */
40662853dc0eSBarry Smith     fortransolvebaij4_(&n,x,ai,aj,diag,aa,b,w);
40672853dc0eSBarry Smith   }
4068aa482453SBarry Smith #elif defined(PETSC_USE_FORTRAN_KERNEL_SOLVEBAIJUNROLL)
40692853dc0eSBarry Smith   fortransolvebaij4unroll_(&n,x,ai,aj,diag,aa,b);
4070e1293385SBarry Smith #else
407130d4dcafSBarry Smith   {
407287828ca2SBarry Smith     PetscScalar     s1,s2,s3,s4,x1,x2,x3,x4;
4073d9fead3dSBarry Smith     const MatScalar *v;
4074356650c2SBarry Smith     PetscInt        jdx,idt,idx,nz,i,ai16;
4075356650c2SBarry Smith     const PetscInt  *vi;
4076e1293385SBarry Smith 
40774e2b4712SSatish Balay   /* forward solve the lower triangular */
40784e2b4712SSatish Balay   idx    = 0;
4079e1293385SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2]; x[3] = b[3];
40804e2b4712SSatish Balay   for (i=1; i<n; i++) {
40814e2b4712SSatish Balay     v     =  aa      + 16*ai[i];
40824e2b4712SSatish Balay     vi    =  aj      + ai[i];
40834e2b4712SSatish Balay     nz    =  diag[i] - ai[i];
4084e1293385SBarry Smith     idx   +=  4;
4085f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
40864e2b4712SSatish Balay     while (nz--) {
40874e2b4712SSatish Balay       jdx   = 4*(*vi++);
40884e2b4712SSatish Balay       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];x4 = x[3+jdx];
4089f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4090f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4091f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4092f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
40934e2b4712SSatish Balay       v    += 16;
40944e2b4712SSatish Balay     }
4095f1af5d2fSBarry Smith     x[idx]   = s1;
4096f1af5d2fSBarry Smith     x[1+idx] = s2;
4097f1af5d2fSBarry Smith     x[2+idx] = s3;
4098f1af5d2fSBarry Smith     x[3+idx] = s4;
40994e2b4712SSatish Balay   }
41004e2b4712SSatish Balay   /* backward solve the upper triangular */
41014e555682SBarry Smith   idt = 4*(n-1);
41024e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
41034e555682SBarry Smith     ai16 = 16*diag[i];
41044e555682SBarry Smith     v    = aa + ai16 + 16;
41054e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
41064e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
4107f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4108f1af5d2fSBarry Smith     s3 = x[2+idt];s4 = x[3+idt];
41094e2b4712SSatish Balay     while (nz--) {
41104e2b4712SSatish Balay       idx   = 4*(*vi++);
41114e2b4712SSatish Balay       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx]; x4 = x[3+idx];
4112f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3   + v[12]*x4;
4113f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3   + v[13]*x4;
4114f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3  + v[14]*x4;
4115f1af5d2fSBarry Smith       s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3  + v[15]*x4;
41164e2b4712SSatish Balay       v    += 16;
41174e2b4712SSatish Balay     }
41184e555682SBarry Smith     v        = aa + ai16;
4119f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4;
4120f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4;
4121f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4122f1af5d2fSBarry Smith     x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4123329f5518SBarry Smith     idt -= 4;
41244e2b4712SSatish Balay   }
412530d4dcafSBarry Smith   }
4126e1293385SBarry Smith #endif
41274e2b4712SSatish Balay 
4128d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
41291ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4130dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
41314e2b4712SSatish Balay   PetscFunctionReturn(0);
41324e2b4712SSatish Balay }
41334e2b4712SSatish Balay 
4134b2b2dd24SShri Abhyankar #undef __FUNCT__
41354dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering"
41364dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering(Mat A,Vec bb,Vec xx)
4137b2b2dd24SShri Abhyankar {
4138b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4139b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4140b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4141b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4142b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4143b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4144b2b2dd24SShri Abhyankar     PetscScalar       *x;
4145b2b2dd24SShri Abhyankar     const PetscScalar *b;
4146b2b2dd24SShri Abhyankar     PetscScalar       s1,s2,s3,s4,x1,x2,x3,x4;
4147cee9d6f2SShri Abhyankar 
4148b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4149b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4150b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4151b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4152b2b2dd24SShri Abhyankar     idx    = 0;
4153b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];x[3] = b[3+idx];
4154b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4155b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4156b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4157b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4158b2b2dd24SShri Abhyankar       idx   = bs*i;
4159b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];s4 = b[3+idx];
4160b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++) {
4161b2b2dd24SShri Abhyankar           jdx   = bs*vi[k];
4162b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];x4 =x[3+jdx];
4163b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4164b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4165b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4166b2b2dd24SShri Abhyankar 	  s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4167b2b2dd24SShri Abhyankar 
4168b2b2dd24SShri Abhyankar           v   +=  bs2;
4169b2b2dd24SShri Abhyankar         }
4170b2b2dd24SShri Abhyankar 
4171b2b2dd24SShri Abhyankar        x[idx]   = s1;
4172b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4173b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4174b2b2dd24SShri Abhyankar        x[3+idx] = s4;
4175b2b2dd24SShri Abhyankar     }
4176b2b2dd24SShri Abhyankar 
4177b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4178b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4179b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4180b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4181b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4182b2b2dd24SShri Abhyankar      idt = bs*i;
4183b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];s4 = x[3+idt];
4184b2b2dd24SShri Abhyankar 
4185b2b2dd24SShri Abhyankar     for(k=0;k<nz;k++){
4186b2b2dd24SShri Abhyankar       idx   = bs*vi[k];
4187b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];x4 = x[3+idx];
4188b2b2dd24SShri Abhyankar        s1   -= v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
4189b2b2dd24SShri Abhyankar        s2   -= v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
4190b2b2dd24SShri Abhyankar        s3   -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4191b2b2dd24SShri Abhyankar        s4   -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4192b2b2dd24SShri Abhyankar 
4193b2b2dd24SShri Abhyankar         v   +=  bs2;
4194b2b2dd24SShri Abhyankar     }
4195b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4196b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[4]*s2 + v[8]*s3 + v[12]*s4;
4197b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[5]*s2 + v[9]*s3 + v[13]*s4;;
4198b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4;
4199b2b2dd24SShri Abhyankar    x[3+idt] = v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4;
4200b2b2dd24SShri Abhyankar 
4201b2b2dd24SShri Abhyankar   }
4202b2b2dd24SShri Abhyankar 
4203b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4204b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4205b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4206b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4207b2b2dd24SShri Abhyankar }
4208cee9d6f2SShri Abhyankar 
4209cee9d6f2SShri Abhyankar #undef __FUNCT__
4210f26ec98cSKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion"
4211dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_Demotion(Mat A,Vec bb,Vec xx)
4212f26ec98cSKris Buschelman {
4213f26ec98cSKris Buschelman   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4214b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j,*diag=a->diag;
4215dfbe8321SBarry Smith   PetscErrorCode    ierr;
4216b3260449SShri Abhyankar   const MatScalar   *aa=a->a;
4217b3260449SShri Abhyankar   const PetscScalar *b;
4218b3260449SShri Abhyankar   PetscScalar       *x;
4219f26ec98cSKris Buschelman 
4220f26ec98cSKris Buschelman   PetscFunctionBegin;
4221b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4223f26ec98cSKris Buschelman 
4224f26ec98cSKris Buschelman   {
4225f26ec98cSKris Buschelman     MatScalar        s1,s2,s3,s4,x1,x2,x3,x4;
4226b3260449SShri Abhyankar     const MatScalar  *v;
4227b3260449SShri Abhyankar     MatScalar        *t=(MatScalar *)x;
4228b3260449SShri Abhyankar     PetscInt         jdx,idt,idx,nz,i,ai16;
4229b3260449SShri Abhyankar     const PetscInt   *vi;
4230f26ec98cSKris Buschelman 
4231f26ec98cSKris Buschelman     /* forward solve the lower triangular */
4232f26ec98cSKris Buschelman     idx  = 0;
4233f26ec98cSKris Buschelman     t[0] = (MatScalar)b[0];
4234f26ec98cSKris Buschelman     t[1] = (MatScalar)b[1];
4235f26ec98cSKris Buschelman     t[2] = (MatScalar)b[2];
4236f26ec98cSKris Buschelman     t[3] = (MatScalar)b[3];
4237f26ec98cSKris Buschelman     for (i=1; i<n; i++) {
4238f26ec98cSKris Buschelman       v     =  aa      + 16*ai[i];
4239f26ec98cSKris Buschelman       vi    =  aj      + ai[i];
4240f26ec98cSKris Buschelman       nz    =  diag[i] - ai[i];
4241f26ec98cSKris Buschelman       idx   +=  4;
4242f26ec98cSKris Buschelman       s1 = (MatScalar)b[idx];
4243f26ec98cSKris Buschelman       s2 = (MatScalar)b[1+idx];
4244f26ec98cSKris Buschelman       s3 = (MatScalar)b[2+idx];
4245f26ec98cSKris Buschelman       s4 = (MatScalar)b[3+idx];
4246f26ec98cSKris Buschelman       while (nz--) {
4247f26ec98cSKris Buschelman         jdx = 4*(*vi++);
4248f26ec98cSKris Buschelman         x1  = t[jdx];
4249f26ec98cSKris Buschelman         x2  = t[1+jdx];
4250f26ec98cSKris Buschelman         x3  = t[2+jdx];
4251f26ec98cSKris Buschelman         x4  = t[3+jdx];
4252f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4253f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4254f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4255f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4256f26ec98cSKris Buschelman         v    += 16;
4257f26ec98cSKris Buschelman       }
4258f26ec98cSKris Buschelman       t[idx]   = s1;
4259f26ec98cSKris Buschelman       t[1+idx] = s2;
4260f26ec98cSKris Buschelman       t[2+idx] = s3;
4261f26ec98cSKris Buschelman       t[3+idx] = s4;
4262f26ec98cSKris Buschelman     }
4263f26ec98cSKris Buschelman     /* backward solve the upper triangular */
4264f26ec98cSKris Buschelman     idt = 4*(n-1);
4265f26ec98cSKris Buschelman     for (i=n-1; i>=0; i--){
4266f26ec98cSKris Buschelman       ai16 = 16*diag[i];
4267f26ec98cSKris Buschelman       v    = aa + ai16 + 16;
4268f26ec98cSKris Buschelman       vi   = aj + diag[i] + 1;
4269f26ec98cSKris Buschelman       nz   = ai[i+1] - diag[i] - 1;
4270f26ec98cSKris Buschelman       s1   = t[idt];
4271f26ec98cSKris Buschelman       s2   = t[1+idt];
4272f26ec98cSKris Buschelman       s3   = t[2+idt];
4273f26ec98cSKris Buschelman       s4   = t[3+idt];
4274f26ec98cSKris Buschelman       while (nz--) {
4275f26ec98cSKris Buschelman         idx = 4*(*vi++);
4276f26ec98cSKris Buschelman         x1  = (MatScalar)x[idx];
4277f26ec98cSKris Buschelman         x2  = (MatScalar)x[1+idx];
4278f26ec98cSKris Buschelman         x3  = (MatScalar)x[2+idx];
4279f26ec98cSKris Buschelman         x4  = (MatScalar)x[3+idx];
4280f26ec98cSKris Buschelman         s1 -= v[0]*x1 + v[4]*x2 + v[8]*x3  + v[12]*x4;
4281f26ec98cSKris Buschelman         s2 -= v[1]*x1 + v[5]*x2 + v[9]*x3  + v[13]*x4;
4282f26ec98cSKris Buschelman         s3 -= v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
4283f26ec98cSKris Buschelman         s4 -= v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
4284f26ec98cSKris Buschelman         v    += 16;
4285f26ec98cSKris Buschelman       }
4286f26ec98cSKris Buschelman       v        = aa + ai16;
4287f26ec98cSKris Buschelman       x[idt]   = (PetscScalar)(v[0]*s1 + v[4]*s2 + v[8]*s3  + v[12]*s4);
4288f26ec98cSKris Buschelman       x[1+idt] = (PetscScalar)(v[1]*s1 + v[5]*s2 + v[9]*s3  + v[13]*s4);
4289f26ec98cSKris Buschelman       x[2+idt] = (PetscScalar)(v[2]*s1 + v[6]*s2 + v[10]*s3 + v[14]*s4);
4290f26ec98cSKris Buschelman       x[3+idt] = (PetscScalar)(v[3]*s1 + v[7]*s2 + v[11]*s3 + v[15]*s4);
4291f26ec98cSKris Buschelman       idt -= 4;
4292f26ec98cSKris Buschelman     }
4293f26ec98cSKris Buschelman   }
4294f26ec98cSKris Buschelman 
4295b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
42961ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4297dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
4298f26ec98cSKris Buschelman   PetscFunctionReturn(0);
4299f26ec98cSKris Buschelman }
4300f26ec98cSKris Buschelman 
43013660e330SKris Buschelman #if defined (PETSC_HAVE_SSE)
43023660e330SKris Buschelman 
43033660e330SKris Buschelman #include PETSC_HAVE_SSE
43043660e330SKris Buschelman #undef __FUNCT__
43057cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj"
4306dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion_usj(Mat A,Vec bb,Vec xx)
43073660e330SKris Buschelman {
43083660e330SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
43092aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)a->j;
4310dfbe8321SBarry Smith   PetscErrorCode ierr;
4311dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
43123660e330SKris Buschelman   MatScalar      *aa=a->a;
431387828ca2SBarry Smith   PetscScalar    *x,*b;
43143660e330SKris Buschelman 
43153660e330SKris Buschelman   PetscFunctionBegin;
43163660e330SKris Buschelman   SSE_SCOPE_BEGIN;
43173660e330SKris Buschelman   /*
43183660e330SKris Buschelman      Note: This code currently uses demotion of double
43193660e330SKris Buschelman      to float when performing the mixed-mode computation.
43203660e330SKris Buschelman      This may not be numerically reasonable for all applications.
43213660e330SKris Buschelman   */
43223660e330SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
43233660e330SKris Buschelman 
43241ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
43251ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
43263660e330SKris Buschelman   {
4327eb05f457SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
4328eb05f457SKris Buschelman     MatScalar      *v,*t=(MatScalar *)x;
43292aa5897fSKris Buschelman     int            nz,i,idt,ai16;
43302aa5897fSKris Buschelman     unsigned int   jdx,idx;
43312aa5897fSKris Buschelman     unsigned short *vi;
4332eb05f457SKris Buschelman     /* Forward solve the lower triangular factor. */
43333660e330SKris Buschelman 
4334eb05f457SKris Buschelman     /* First block is the identity. */
43353660e330SKris Buschelman     idx  = 0;
4336eb05f457SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
43372aa5897fSKris Buschelman     v    =  aa + 16*((unsigned int)ai[1]);
43383660e330SKris Buschelman 
43393660e330SKris Buschelman     for (i=1; i<n;) {
43403660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
43413660e330SKris Buschelman       vi   =  aj      + ai[i];
43423660e330SKris Buschelman       nz   =  diag[i] - ai[i];
43433660e330SKris Buschelman       idx +=  4;
43443660e330SKris Buschelman 
4345eb05f457SKris Buschelman       /* Demote RHS from double to float. */
4346eb05f457SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
4347eb05f457SKris Buschelman       LOAD_PS(&t[idx],XMM7);
43483660e330SKris Buschelman 
43493660e330SKris Buschelman       while (nz--) {
43503660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
43512aa5897fSKris Buschelman         jdx = 4*((unsigned int)(*vi++));
43523660e330SKris Buschelman 
43533660e330SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
4354eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
43553660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
43563660e330SKris Buschelman 
43573660e330SKris Buschelman           /* First Column */
43583660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
43593660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
43603660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
43613660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
43623660e330SKris Buschelman 
43633660e330SKris Buschelman           /* Second Column */
43643660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
43653660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
43663660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
43673660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
43683660e330SKris Buschelman 
43693660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
43703660e330SKris Buschelman 
43713660e330SKris Buschelman           /* Third Column */
43723660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
43733660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
43743660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
43753660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
43763660e330SKris Buschelman 
43773660e330SKris Buschelman           /* Fourth Column */
43783660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
43793660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
43803660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
43813660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
43823660e330SKris Buschelman         SSE_INLINE_END_2
43833660e330SKris Buschelman 
43843660e330SKris Buschelman         v  += 16;
43853660e330SKris Buschelman       }
43863660e330SKris Buschelman       v    =  aa + 16*ai[++i];
43873660e330SKris Buschelman       PREFETCH_NTA(v);
4388eb05f457SKris Buschelman       STORE_PS(&t[idx],XMM7);
43893660e330SKris Buschelman     }
4390eb05f457SKris Buschelman 
4391eb05f457SKris Buschelman     /* Backward solve the upper triangular factor.*/
4392eb05f457SKris Buschelman 
43933660e330SKris Buschelman     idt  = 4*(n-1);
43943660e330SKris Buschelman     ai16 = 16*diag[n-1];
43953660e330SKris Buschelman     v    = aa + ai16 + 16;
43963660e330SKris Buschelman     for (i=n-1; i>=0;){
43973660e330SKris Buschelman       PREFETCH_NTA(&v[8]);
43983660e330SKris Buschelman       vi = aj + diag[i] + 1;
43993660e330SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
44003660e330SKris Buschelman 
4401eb05f457SKris Buschelman       LOAD_PS(&t[idt],XMM7);
44023660e330SKris Buschelman 
44033660e330SKris Buschelman       while (nz--) {
44043660e330SKris Buschelman         PREFETCH_NTA(&v[16]);
44052aa5897fSKris Buschelman         idx = 4*((unsigned int)(*vi++));
44063660e330SKris Buschelman 
44073660e330SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
4408eb05f457SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
44093660e330SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
44103660e330SKris Buschelman 
44113660e330SKris Buschelman           /* First Column */
44123660e330SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
44133660e330SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
44143660e330SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
44153660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
44163660e330SKris Buschelman 
44173660e330SKris Buschelman           /* Second Column */
44183660e330SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
44193660e330SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
44203660e330SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
44213660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
44223660e330SKris Buschelman 
44233660e330SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
44243660e330SKris Buschelman 
44253660e330SKris Buschelman           /* Third Column */
44263660e330SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
44273660e330SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
44283660e330SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
44293660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
44303660e330SKris Buschelman 
44313660e330SKris Buschelman           /* Fourth Column */
44323660e330SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
44333660e330SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
44343660e330SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
44353660e330SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
44363660e330SKris Buschelman         SSE_INLINE_END_2
44373660e330SKris Buschelman         v  += 16;
44383660e330SKris Buschelman       }
44393660e330SKris Buschelman       v    = aa + ai16;
44403660e330SKris Buschelman       ai16 = 16*diag[--i];
44413660e330SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
44423660e330SKris Buschelman       /*
44433660e330SKris Buschelman          Scale the result by the diagonal 4x4 block,
44443660e330SKris Buschelman          which was inverted as part of the factorization
44453660e330SKris Buschelman       */
4446eb05f457SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
44473660e330SKris Buschelman         /* First Column */
44483660e330SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
44493660e330SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
44503660e330SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
44513660e330SKris Buschelman 
44523660e330SKris Buschelman         /* Second Column */
44533660e330SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
44543660e330SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
44553660e330SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
44563660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
44573660e330SKris Buschelman 
44583660e330SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
44593660e330SKris Buschelman 
44603660e330SKris Buschelman         /* Third Column */
44613660e330SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
44623660e330SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
44633660e330SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
44643660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
44653660e330SKris Buschelman 
44663660e330SKris Buschelman         /* Fourth Column */
44673660e330SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
44683660e330SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
44693660e330SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
44703660e330SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
44713660e330SKris Buschelman 
44723660e330SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
44733660e330SKris Buschelman       SSE_INLINE_END_3
44743660e330SKris Buschelman 
44753660e330SKris Buschelman       v    = aa + ai16 + 16;
44763660e330SKris Buschelman       idt -= 4;
44773660e330SKris Buschelman     }
4478eb05f457SKris Buschelman 
4479eb05f457SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
4480eb05f457SKris Buschelman     idt = 4*(n-1);
4481eb05f457SKris Buschelman     for (i=n-1;i>=0;i--) {
4482eb05f457SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
4483eb05f457SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
4484eb05f457SKris Buschelman       PetscScalar *xtemp=&x[idt];
4485eb05f457SKris Buschelman       MatScalar   *ttemp=&t[idt];
4486eb05f457SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
4487eb05f457SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
4488eb05f457SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
4489eb05f457SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
449054693613SKris Buschelman       idt -= 4;
44913660e330SKris Buschelman     }
4492eb05f457SKris Buschelman 
4493eb05f457SKris Buschelman   } /* End of artificial scope. */
44941ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
44951ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4496dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
44973660e330SKris Buschelman   SSE_SCOPE_END;
44983660e330SKris Buschelman   PetscFunctionReturn(0);
44993660e330SKris Buschelman }
45003660e330SKris Buschelman 
45017cf1b8d3SKris Buschelman #undef __FUNCT__
45027cf1b8d3SKris Buschelman #define __FUNCT__ "MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion"
/*
   MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion - Forward and backward
   triangular solves with the factored 4x4-block matrix A for the right-hand
   side bb, leaving the result in xx.  No row or column permutation is applied:
   blocks of bb and xx are addressed directly by block row.

   Input:   A  - factored matrix; a->i, a->j, a->diag and a->a are read
            bb - right-hand side
   Output:  xx - solution
   Returns: 0 on success; errors from the Vec/logging calls propagate via CHKERRQ.

   The arithmetic runs on MatScalar values stored inside xx's own array
   (t aliases x); the result is widened to PetscScalar in place at the end.
   The SSE_ and _PS macros are defined outside this function; comments below
   that describe register contents rely on their names and should be
   confirmed against the macro definitions.
*/
4503dfbe8321SBarry Smith PetscErrorCode MatSolve_SeqBAIJ_4_NaturalOrdering_SSE_Demotion(Mat A,Vec bb,Vec xx)
45047cf1b8d3SKris Buschelman {
45057cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
45067cf1b8d3SKris Buschelman   int            *aj=a->j;
4507dfbe8321SBarry Smith   PetscErrorCode ierr;
4508dfbe8321SBarry Smith   int            *ai=a->i,n=a->mbs,*diag = a->diag;
45097cf1b8d3SKris Buschelman   MatScalar      *aa=a->a;
45107cf1b8d3SKris Buschelman   PetscScalar    *x,*b;
45117cf1b8d3SKris Buschelman 
45127cf1b8d3SKris Buschelman   PetscFunctionBegin;
45137cf1b8d3SKris Buschelman   SSE_SCOPE_BEGIN;
45147cf1b8d3SKris Buschelman   /*
45157cf1b8d3SKris Buschelman      Note: This code currently uses demotion of double
45167cf1b8d3SKris Buschelman      to float when performing the mixed-mode computation.
45177cf1b8d3SKris Buschelman      This may not be numerically reasonable for all applications.
45187cf1b8d3SKris Buschelman   */
45197cf1b8d3SKris Buschelman   PREFETCH_NTA(aa+16*ai[1]);
45207cf1b8d3SKris Buschelman 
45211ebc52fbSHong Zhang   ierr = VecGetArray(bb,&b);CHKERRQ(ierr);
45221ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
45237cf1b8d3SKris Buschelman   {
45247cf1b8d3SKris Buschelman     /* x will first be computed in single precision then promoted inplace to double */
45257cf1b8d3SKris Buschelman     MatScalar *v,*t=(MatScalar *)x;
45267cf1b8d3SKris Buschelman     int       nz,i,idt,ai16;
45277cf1b8d3SKris Buschelman     int       jdx,idx;
45287cf1b8d3SKris Buschelman     int       *vi;
45297cf1b8d3SKris Buschelman     /* Forward solve the lower triangular factor. */
45307cf1b8d3SKris Buschelman 
45317cf1b8d3SKris Buschelman     /* First block is the identity. */
45327cf1b8d3SKris Buschelman     idx  = 0;
45337cf1b8d3SKris Buschelman     CONVERT_DOUBLE4_FLOAT4(t,b);
45347cf1b8d3SKris Buschelman     v    =  aa + 16*ai[1];
45357cf1b8d3SKris Buschelman 
    /* The loop header has no increment: i is advanced by the ++i at the bottom of the body. */
45367cf1b8d3SKris Buschelman     for (i=1; i<n;) {
45377cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
45387cf1b8d3SKris Buschelman       vi   =  aj      + ai[i];
45397cf1b8d3SKris Buschelman       nz   =  diag[i] - ai[i];
45407cf1b8d3SKris Buschelman       idx +=  4;
45417cf1b8d3SKris Buschelman 
45427cf1b8d3SKris Buschelman       /* Demote RHS from double to float. */
45437cf1b8d3SKris Buschelman       CONVERT_DOUBLE4_FLOAT4(&t[idx],&b[idx]);
      /* XMM7 presumably holds this block row's running right-hand side until the STORE_PS below. */
45447cf1b8d3SKris Buschelman       LOAD_PS(&t[idx],XMM7);
45457cf1b8d3SKris Buschelman 
45467cf1b8d3SKris Buschelman       while (nz--) {
45477cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
45487cf1b8d3SKris Buschelman         jdx = 4*(*vi++);
45497cf1b8d3SKris Buschelman /*          jdx = *vi++; */
45507cf1b8d3SKris Buschelman 
45517cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector product with negative accumulation: */
45527cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[jdx],v)
45537cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
45547cf1b8d3SKris Buschelman 
45557cf1b8d3SKris Buschelman           /* First Column */
45567cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
45577cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
45587cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
45597cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
45607cf1b8d3SKris Buschelman 
45617cf1b8d3SKris Buschelman           /* Second Column */
45627cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
45637cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
45647cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
45657cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
45667cf1b8d3SKris Buschelman 
45677cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
45687cf1b8d3SKris Buschelman 
45697cf1b8d3SKris Buschelman           /* Third Column */
45707cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
45717cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
45727cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
45737cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
45747cf1b8d3SKris Buschelman 
45757cf1b8d3SKris Buschelman           /* Fourth Column */
45767cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
45777cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
45787cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
45797cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
45807cf1b8d3SKris Buschelman         SSE_INLINE_END_2
45817cf1b8d3SKris Buschelman 
45827cf1b8d3SKris Buschelman         v  += 16;
45837cf1b8d3SKris Buschelman       }
      /* Advance to the next block row: v is repositioned at the first block of row i+1. */
45847cf1b8d3SKris Buschelman       v    =  aa + 16*ai[++i];
45857cf1b8d3SKris Buschelman       PREFETCH_NTA(v);
45867cf1b8d3SKris Buschelman       STORE_PS(&t[idx],XMM7);
45877cf1b8d3SKris Buschelman     }
45887cf1b8d3SKris Buschelman 
45897cf1b8d3SKris Buschelman     /* Backward solve the upper triangular factor.*/
45907cf1b8d3SKris Buschelman 
    /* ai16 tracks 16*diag[row]; v starts one block past the diagonal block of the last row. */
45917cf1b8d3SKris Buschelman     idt  = 4*(n-1);
45927cf1b8d3SKris Buschelman     ai16 = 16*diag[n-1];
45937cf1b8d3SKris Buschelman     v    = aa + ai16 + 16;
    /* As in the forward loop, i is changed inside the body (the --i below). */
45947cf1b8d3SKris Buschelman     for (i=n-1; i>=0;){
45957cf1b8d3SKris Buschelman       PREFETCH_NTA(&v[8]);
45967cf1b8d3SKris Buschelman       vi = aj + diag[i] + 1;
45977cf1b8d3SKris Buschelman       nz = ai[i+1] - diag[i] - 1;
45987cf1b8d3SKris Buschelman 
45997cf1b8d3SKris Buschelman       LOAD_PS(&t[idt],XMM7);
46007cf1b8d3SKris Buschelman 
46017cf1b8d3SKris Buschelman       while (nz--) {
46027cf1b8d3SKris Buschelman         PREFETCH_NTA(&v[16]);
46037cf1b8d3SKris Buschelman         idx = 4*(*vi++);
46047cf1b8d3SKris Buschelman /*          idx = *vi++; */
46057cf1b8d3SKris Buschelman 
46067cf1b8d3SKris Buschelman         /* 4x4 Matrix-Vector Product with negative accumulation: */
46077cf1b8d3SKris Buschelman         SSE_INLINE_BEGIN_2(&t[idx],v)
46087cf1b8d3SKris Buschelman           SSE_LOAD_PS(SSE_ARG_1,FLOAT_0,XMM6)
46097cf1b8d3SKris Buschelman 
46107cf1b8d3SKris Buschelman           /* First Column */
46117cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM0,XMM6)
46127cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM0,XMM0,0x00)
46137cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM0,SSE_ARG_2,FLOAT_0)
46147cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM0)
46157cf1b8d3SKris Buschelman 
46167cf1b8d3SKris Buschelman           /* Second Column */
46177cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM1,XMM6)
46187cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM1,XMM1,0x55)
46197cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM1,SSE_ARG_2,FLOAT_4)
46207cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM1)
46217cf1b8d3SKris Buschelman 
46227cf1b8d3SKris Buschelman           SSE_PREFETCH_NTA(SSE_ARG_2,FLOAT_24)
46237cf1b8d3SKris Buschelman 
46247cf1b8d3SKris Buschelman           /* Third Column */
46257cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM2,XMM6)
46267cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM2,XMM2,0xAA)
46277cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM2,SSE_ARG_2,FLOAT_8)
46287cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM2)
46297cf1b8d3SKris Buschelman 
46307cf1b8d3SKris Buschelman           /* Fourth Column */
46317cf1b8d3SKris Buschelman           SSE_COPY_PS(XMM3,XMM6)
46327cf1b8d3SKris Buschelman           SSE_SHUFFLE(XMM3,XMM3,0xFF)
46337cf1b8d3SKris Buschelman           SSE_MULT_PS_M(XMM3,SSE_ARG_2,FLOAT_12)
46347cf1b8d3SKris Buschelman           SSE_SUB_PS(XMM7,XMM3)
46357cf1b8d3SKris Buschelman         SSE_INLINE_END_2
46367cf1b8d3SKris Buschelman         v  += 16;
46377cf1b8d3SKris Buschelman       }
      /* ai16 still refers to the current row here, so v points at this row's diagonal block. */
46387cf1b8d3SKris Buschelman       v    = aa + ai16;
      /* NOTE(review): on the final pass (i == 0) the --i below makes this read diag[-1];
         the value only feeds prefetch addresses and the dead store to v at the loop
         bottom, but the read itself is outside the array - confirm this is tolerated. */
46397cf1b8d3SKris Buschelman       ai16 = 16*diag[--i];
46407cf1b8d3SKris Buschelman       PREFETCH_NTA(aa+ai16+16);
46417cf1b8d3SKris Buschelman       /*
46427cf1b8d3SKris Buschelman          Scale the result by the diagonal 4x4 block,
46437cf1b8d3SKris Buschelman          which was inverted as part of the factorization
46447cf1b8d3SKris Buschelman       */
46457cf1b8d3SKris Buschelman       SSE_INLINE_BEGIN_3(v,&t[idt],aa+ai16)
46467cf1b8d3SKris Buschelman         /* First Column */
46477cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM0,XMM7)
46487cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM0,XMM0,0x00)
46497cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM0,SSE_ARG_1,FLOAT_0)
46507cf1b8d3SKris Buschelman 
46517cf1b8d3SKris Buschelman         /* Second Column */
46527cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM1,XMM7)
46537cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM1,XMM1,0x55)
46547cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM1,SSE_ARG_1,FLOAT_4)
46557cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM1)
46567cf1b8d3SKris Buschelman 
46577cf1b8d3SKris Buschelman         SSE_PREFETCH_NTA(SSE_ARG_3,FLOAT_24)
46587cf1b8d3SKris Buschelman 
46597cf1b8d3SKris Buschelman         /* Third Column */
46607cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM2,XMM7)
46617cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM2,XMM2,0xAA)
46627cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM2,SSE_ARG_1,FLOAT_8)
46637cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM2)
46647cf1b8d3SKris Buschelman 
46657cf1b8d3SKris Buschelman         /* Fourth Column */
46667cf1b8d3SKris Buschelman         SSE_COPY_PS(XMM3,XMM7)
46677cf1b8d3SKris Buschelman         SSE_SHUFFLE(XMM3,XMM3,0xFF)
46687cf1b8d3SKris Buschelman         SSE_MULT_PS_M(XMM3,SSE_ARG_1,FLOAT_12)
46697cf1b8d3SKris Buschelman         SSE_ADD_PS(XMM0,XMM3)
46707cf1b8d3SKris Buschelman 
46717cf1b8d3SKris Buschelman         SSE_STORE_PS(SSE_ARG_2,FLOAT_0,XMM0)
46727cf1b8d3SKris Buschelman       SSE_INLINE_END_3
46737cf1b8d3SKris Buschelman 
46747cf1b8d3SKris Buschelman       v    = aa + ai16 + 16;
46757cf1b8d3SKris Buschelman       idt -= 4;
46767cf1b8d3SKris Buschelman     }
46777cf1b8d3SKris Buschelman 
46787cf1b8d3SKris Buschelman     /* Convert t from single precision back to double precision (inplace)*/
    /* t and x share storage, so blocks are widened last-to-first and, within a block,
       entry 3 down to entry 0; presumably this keeps each wider write from landing on
       a narrower value that has not been read yet. */
46797cf1b8d3SKris Buschelman     idt = 4*(n-1);
46807cf1b8d3SKris Buschelman     for (i=n-1;i>=0;i--) {
46817cf1b8d3SKris Buschelman       /*     CONVERT_FLOAT4_DOUBLE4(&x[idt],&t[idt]); */
46827cf1b8d3SKris Buschelman       /* Unfortunately, CONVERT_ will count from 0 to 3 which doesn't work here. */
46837cf1b8d3SKris Buschelman       PetscScalar *xtemp=&x[idt];
46847cf1b8d3SKris Buschelman       MatScalar   *ttemp=&t[idt];
46857cf1b8d3SKris Buschelman       xtemp[3] = (PetscScalar)ttemp[3];
46867cf1b8d3SKris Buschelman       xtemp[2] = (PetscScalar)ttemp[2];
46877cf1b8d3SKris Buschelman       xtemp[1] = (PetscScalar)ttemp[1];
46887cf1b8d3SKris Buschelman       xtemp[0] = (PetscScalar)ttemp[0];
46897cf1b8d3SKris Buschelman       idt -= 4;
46907cf1b8d3SKris Buschelman     }
46917cf1b8d3SKris Buschelman 
46927cf1b8d3SKris Buschelman   } /* End of artificial scope. */
46931ebc52fbSHong Zhang   ierr = VecRestoreArray(bb,&b);CHKERRQ(ierr);
46941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4695dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*16*(a->nz) - 4.0*A->cmap->n);CHKERRQ(ierr);
46967cf1b8d3SKris Buschelman   SSE_SCOPE_END;
46977cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
46987cf1b8d3SKris Buschelman }
46997cf1b8d3SKris Buschelman 
47003660e330SKris Buschelman #endif
47018f690400SShri Abhyankar 
47024a2ae208SSatish Balay #undef __FUNCT__
470306e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_inplace"
470406e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_inplace(Mat A,Vec bb,Vec xx)
47054e2b4712SSatish Balay {
47064e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47074e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
47086849ba73SBarry Smith   PetscErrorCode    ierr;
4709b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4710b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
47115d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4712d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4713d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
4714d9fead3dSBarry Smith   const PetscScalar *b;
47154e2b4712SSatish Balay 
47164e2b4712SSatish Balay   PetscFunctionBegin;
4717d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47181ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4719f1af5d2fSBarry Smith   t  = a->solve_work;
47204e2b4712SSatish Balay 
47214e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47224e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
47234e2b4712SSatish Balay 
47244e2b4712SSatish Balay   /* forward solve the lower triangular */
47254e2b4712SSatish Balay   idx    = 3*(*r++);
4726f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
47274e2b4712SSatish Balay   for (i=1; i<n; i++) {
47284e2b4712SSatish Balay     v     = aa + 9*ai[i];
47294e2b4712SSatish Balay     vi    = aj + ai[i];
47304e2b4712SSatish Balay     nz    = diag[i] - ai[i];
47314e2b4712SSatish Balay     idx   = 3*(*r++);
4732f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
47334e2b4712SSatish Balay     while (nz--) {
47344e2b4712SSatish Balay       idx   = 3*(*vi++);
4735f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4736f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4737f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4738f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
47394e2b4712SSatish Balay       v += 9;
47404e2b4712SSatish Balay     }
47414e2b4712SSatish Balay     idx = 3*i;
4742f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
47434e2b4712SSatish Balay   }
47444e2b4712SSatish Balay   /* backward solve the upper triangular */
47454e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
47464e2b4712SSatish Balay     v    = aa + 9*diag[i] + 9;
47474e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
47484e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
47494e2b4712SSatish Balay     idt  = 3*i;
4750f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
47514e2b4712SSatish Balay     while (nz--) {
47524e2b4712SSatish Balay       idx   = 3*(*vi++);
4753f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
4754f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4755f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4756f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
47574e2b4712SSatish Balay       v += 9;
47584e2b4712SSatish Balay     }
47594e2b4712SSatish Balay     idc = 3*(*c--);
47604e2b4712SSatish Balay     v   = aa + 9*diag[i];
4761f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4762f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4763f1af5d2fSBarry Smith     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
47644e2b4712SSatish Balay   }
47654e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
47664e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
4767d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47681ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4769dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
47704e2b4712SSatish Balay   PetscFunctionReturn(0);
47714e2b4712SSatish Balay }
47724e2b4712SSatish Balay 
47730c4413a7SShri Abhyankar #undef __FUNCT__
47744dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3"
47754dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3(Mat A,Vec bb,Vec xx)
47760c4413a7SShri Abhyankar {
47770c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
47780c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
47790c4413a7SShri Abhyankar   PetscErrorCode    ierr;
4780b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4781b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc,m;
47820c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
47830c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
47840c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,s3,x1,x2,x3,*t;
47850c4413a7SShri Abhyankar   const PetscScalar *b;
47860c4413a7SShri Abhyankar 
47870c4413a7SShri Abhyankar   PetscFunctionBegin;
47880c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
47890c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
47900c4413a7SShri Abhyankar   t  = a->solve_work;
47910c4413a7SShri Abhyankar 
47920c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
47930c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
47940c4413a7SShri Abhyankar 
47950c4413a7SShri Abhyankar   /* forward solve the lower triangular */
47960c4413a7SShri Abhyankar   idx    = 3*r[0];
47970c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx]; t[2] = b[2+idx];
47980c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
47990c4413a7SShri Abhyankar     v     = aa + 9*ai[i];
48000c4413a7SShri Abhyankar     vi    = aj + ai[i];
48010c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
48020c4413a7SShri Abhyankar     idx   = 3*r[i];
48030c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx]; s3 = b[2+idx];
48040c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48050c4413a7SShri Abhyankar       idx   = 3*vi[m];
48060c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48070c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48080c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48090c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48100c4413a7SShri Abhyankar       v += 9;
48110c4413a7SShri Abhyankar     }
48120c4413a7SShri Abhyankar     idx = 3*i;
48130c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2; t[2+idx] = s3;
48140c4413a7SShri Abhyankar   }
48150c4413a7SShri Abhyankar   /* backward solve the upper triangular */
48160c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
48170c4413a7SShri Abhyankar     v    = aa + 9*(adiag[i+1]+1);
48180c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
48190c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
48200c4413a7SShri Abhyankar     idt  = 3*i;
48210c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt]; s3 = t[2+idt];
48220c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
48230c4413a7SShri Abhyankar       idx   = 3*vi[m];
48240c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx]; x3 = t[2+idx];
48250c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
48260c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
48270c4413a7SShri Abhyankar       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
48280c4413a7SShri Abhyankar       v += 9;
48290c4413a7SShri Abhyankar     }
48300c4413a7SShri Abhyankar     idc = 3*c[i];
48310c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
48320c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
48330c4413a7SShri Abhyankar     x[2+idc] = t[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
48340c4413a7SShri Abhyankar   }
48350c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
48360c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
48370c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48380c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
48390c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
48400c4413a7SShri Abhyankar   PetscFunctionReturn(0);
48410c4413a7SShri Abhyankar }
48420c4413a7SShri Abhyankar 
484315091d37SBarry Smith /*
484415091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
484515091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
484615091d37SBarry Smith */
48474a2ae208SSatish Balay #undef __FUNCT__
484806e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering_inplace"
484906e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
485015091d37SBarry Smith {
485115091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
48520b68f018SBarry Smith   const PetscInt    n=a->mbs,*ai=a->i,*aj=a->j;
4853dfbe8321SBarry Smith   PetscErrorCode    ierr;
48540b68f018SBarry Smith   const PetscInt    *diag = a->diag,*vi;
4855d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4856d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,s3,x1,x2,x3;
4857d9fead3dSBarry Smith   const PetscScalar *b;
48580b68f018SBarry Smith   PetscInt          jdx,idt,idx,nz,i;
485915091d37SBarry Smith 
486015091d37SBarry Smith   PetscFunctionBegin;
4861d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
48621ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
486315091d37SBarry Smith 
486415091d37SBarry Smith   /* forward solve the lower triangular */
486515091d37SBarry Smith   idx    = 0;
486615091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1]; x[2] = b[2];
486715091d37SBarry Smith   for (i=1; i<n; i++) {
486815091d37SBarry Smith     v     =  aa      + 9*ai[i];
486915091d37SBarry Smith     vi    =  aj      + ai[i];
487015091d37SBarry Smith     nz    =  diag[i] - ai[i];
487115091d37SBarry Smith     idx   +=  3;
4872f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];s3 = b[2+idx];
487315091d37SBarry Smith     while (nz--) {
487415091d37SBarry Smith       jdx   = 3*(*vi++);
487515091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];x3 = x[2+jdx];
4876f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4877f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4878f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
487915091d37SBarry Smith       v    += 9;
488015091d37SBarry Smith     }
4881f1af5d2fSBarry Smith     x[idx]   = s1;
4882f1af5d2fSBarry Smith     x[1+idx] = s2;
4883f1af5d2fSBarry Smith     x[2+idx] = s3;
488415091d37SBarry Smith   }
488515091d37SBarry Smith   /* backward solve the upper triangular */
488615091d37SBarry Smith   for (i=n-1; i>=0; i--){
488715091d37SBarry Smith     v    = aa + 9*diag[i] + 9;
488815091d37SBarry Smith     vi   = aj + diag[i] + 1;
488915091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
489015091d37SBarry Smith     idt  = 3*i;
4891f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
4892f1af5d2fSBarry Smith     s3 = x[2+idt];
489315091d37SBarry Smith     while (nz--) {
489415091d37SBarry Smith       idx   = 3*(*vi++);
489515091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];x3    = x[2+idx];
4896f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4897f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4898f1af5d2fSBarry Smith       s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
489915091d37SBarry Smith       v    += 9;
490015091d37SBarry Smith     }
490115091d37SBarry Smith     v        = aa +  9*diag[i];
4902f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4903f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4904f1af5d2fSBarry Smith     x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
490515091d37SBarry Smith   }
490615091d37SBarry Smith 
4907d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
49081ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4909dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*9*(a->nz) - 3.0*A->cmap->n);CHKERRQ(ierr);
491015091d37SBarry Smith   PetscFunctionReturn(0);
491115091d37SBarry Smith }
491215091d37SBarry Smith 
4913cee9d6f2SShri Abhyankar #undef __FUNCT__
49144dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_3_NaturalOrdering"
49154dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_3_NaturalOrdering(Mat A,Vec bb,Vec xx)
4916b2b2dd24SShri Abhyankar {
4917b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
4918b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
4919b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
4920b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,jdx,idt;
4921b3260449SShri Abhyankar     const PetscInt    bs = A->rmap->bs,bs2 = a->bs2;
4922b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
4923b2b2dd24SShri Abhyankar     PetscScalar       *x;
4924b2b2dd24SShri Abhyankar     const PetscScalar *b;
4925b2b2dd24SShri Abhyankar     PetscScalar        s1,s2,s3,x1,x2,x3;
4926b2b2dd24SShri Abhyankar 
4927b2b2dd24SShri Abhyankar     PetscFunctionBegin;
4928b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4929b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
4930b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
4931b2b2dd24SShri Abhyankar     idx    = 0;
4932b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];x[2] = b[2+idx];
4933b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
4934b2b2dd24SShri Abhyankar        v    = aa + bs2*ai[i];
4935b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
4936b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
4937b2b2dd24SShri Abhyankar       idx   = bs*i;
4938b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];s3 = b[2+idx];
4939b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
4940b2b2dd24SShri Abhyankar          jdx   = bs*vi[k];
4941b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx]; x3 =x[2+jdx];
4942b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4943b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4944b2b2dd24SShri Abhyankar           s3   -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4945b2b2dd24SShri Abhyankar 
4946b2b2dd24SShri Abhyankar           v   +=  bs2;
4947b2b2dd24SShri Abhyankar         }
4948b2b2dd24SShri Abhyankar 
4949b2b2dd24SShri Abhyankar        x[idx]   = s1;
4950b2b2dd24SShri Abhyankar        x[1+idx] = s2;
4951b2b2dd24SShri Abhyankar        x[2+idx] = s3;
4952b2b2dd24SShri Abhyankar     }
4953b2b2dd24SShri Abhyankar 
4954b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
4955b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
4956b2b2dd24SShri Abhyankar     v   = aa + bs2*(adiag[i+1]+1);
4957b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
4958b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
4959b2b2dd24SShri Abhyankar      idt = bs*i;
4960b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];s3 = x[2+idt];
4961b2b2dd24SShri Abhyankar 
4962b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
4963b2b2dd24SShri Abhyankar        idx   = bs*vi[k];
4964b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx]; x3 = x[2+idx];
4965b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[3]*x2 + v[6]*x3;
4966b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[4]*x2 + v[7]*x3;
4967b2b2dd24SShri Abhyankar        s3 -= v[2]*x1 + v[5]*x2 + v[8]*x3;
4968b2b2dd24SShri Abhyankar 
4969b2b2dd24SShri Abhyankar         v   +=  bs2;
4970b2b2dd24SShri Abhyankar     }
4971b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
4972b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[3]*s2 + v[6]*s3;
4973b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[4]*s2 + v[7]*s3;
4974b2b2dd24SShri Abhyankar    x[2+idt] = v[2]*s1 + v[5]*s2 + v[8]*s3;
4975b2b2dd24SShri Abhyankar 
4976b2b2dd24SShri Abhyankar   }
4977b2b2dd24SShri Abhyankar 
4978b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
4979b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
4980b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*bs2*(a->nz) - bs*A->cmap->n);CHKERRQ(ierr);
4981b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
4982b2b2dd24SShri Abhyankar }
4983b2b2dd24SShri Abhyankar 
4984b2b2dd24SShri Abhyankar #undef __FUNCT__
498506e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_inplace"
498606e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_inplace(Mat A,Vec bb,Vec xx)
49874e2b4712SSatish Balay {
49884e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
49894e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
49906849ba73SBarry Smith   PetscErrorCode    ierr;
4991b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
4992b3260449SShri Abhyankar   PetscInt          i,nz,idx,idt,idc;
49935d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
4994d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
4995d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2,*t;
4996d9fead3dSBarry Smith   const PetscScalar *b;
49974e2b4712SSatish Balay 
49984e2b4712SSatish Balay   PetscFunctionBegin;
4999d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
50001ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5001f1af5d2fSBarry Smith   t  = a->solve_work;
50024e2b4712SSatish Balay 
50034e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50044e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
50054e2b4712SSatish Balay 
50064e2b4712SSatish Balay   /* forward solve the lower triangular */
50074e2b4712SSatish Balay   idx    = 2*(*r++);
5008f1af5d2fSBarry Smith   t[0] = b[idx]; t[1] = b[1+idx];
50094e2b4712SSatish Balay   for (i=1; i<n; i++) {
50104e2b4712SSatish Balay     v     = aa + 4*ai[i];
50114e2b4712SSatish Balay     vi    = aj + ai[i];
50124e2b4712SSatish Balay     nz    = diag[i] - ai[i];
50134e2b4712SSatish Balay     idx   = 2*(*r++);
5014f1af5d2fSBarry Smith     s1  = b[idx]; s2 = b[1+idx];
50154e2b4712SSatish Balay     while (nz--) {
50164e2b4712SSatish Balay       idx   = 2*(*vi++);
5017f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5018f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5019f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
50204e2b4712SSatish Balay       v += 4;
50214e2b4712SSatish Balay     }
50224e2b4712SSatish Balay     idx = 2*i;
5023f1af5d2fSBarry Smith     t[idx] = s1; t[1+idx] = s2;
50244e2b4712SSatish Balay   }
50254e2b4712SSatish Balay   /* backward solve the upper triangular */
50264e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
50274e2b4712SSatish Balay     v    = aa + 4*diag[i] + 4;
50284e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
50294e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
50304e2b4712SSatish Balay     idt  = 2*i;
5031f1af5d2fSBarry Smith     s1 = t[idt]; s2 = t[1+idt];
50324e2b4712SSatish Balay     while (nz--) {
50334e2b4712SSatish Balay       idx   = 2*(*vi++);
5034f1af5d2fSBarry Smith       x1    = t[idx]; x2 = t[1+idx];
5035f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5036f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
50374e2b4712SSatish Balay       v += 4;
50384e2b4712SSatish Balay     }
50394e2b4712SSatish Balay     idc = 2*(*c--);
50404e2b4712SSatish Balay     v   = aa + 4*diag[i];
5041f1af5d2fSBarry Smith     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
5042f1af5d2fSBarry Smith     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
50434e2b4712SSatish Balay   }
50444e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
50454e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5046d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
50471ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5048dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
50494e2b4712SSatish Balay   PetscFunctionReturn(0);
50504e2b4712SSatish Balay }
50514e2b4712SSatish Balay 
50520c4413a7SShri Abhyankar #undef __FUNCT__
50534dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2"
50544dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2(Mat A,Vec bb,Vec xx)
50550c4413a7SShri Abhyankar {
50560c4413a7SShri Abhyankar   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
50570c4413a7SShri Abhyankar   IS                iscol=a->col,isrow=a->row;
50580c4413a7SShri Abhyankar   PetscErrorCode    ierr;
5059b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5060b3260449SShri Abhyankar   PetscInt          i,nz,idx,jdx,idt,idc,m;
50610c4413a7SShri Abhyankar   const PetscInt    *r,*c,*rout,*cout;
50620c4413a7SShri Abhyankar   const MatScalar   *aa=a->a,*v;
50630c4413a7SShri Abhyankar   PetscScalar       *x,s1,s2,x1,x2,*t;
50640c4413a7SShri Abhyankar   const PetscScalar *b;
50650c4413a7SShri Abhyankar 
50660c4413a7SShri Abhyankar   PetscFunctionBegin;
50670c4413a7SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
50680c4413a7SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
50690c4413a7SShri Abhyankar   t  = a->solve_work;
50700c4413a7SShri Abhyankar 
50710c4413a7SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
50720c4413a7SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
50730c4413a7SShri Abhyankar 
50740c4413a7SShri Abhyankar   /* forward solve the lower triangular */
50750c4413a7SShri Abhyankar   idx    = 2*r[0];
50760c4413a7SShri Abhyankar   t[0] = b[idx]; t[1] = b[1+idx];
50770c4413a7SShri Abhyankar   for (i=1; i<n; i++) {
50780c4413a7SShri Abhyankar     v     = aa + 4*ai[i];
50790c4413a7SShri Abhyankar     vi    = aj + ai[i];
50800c4413a7SShri Abhyankar     nz    = ai[i+1] - ai[i];
50810c4413a7SShri Abhyankar     idx   = 2*r[i];
50820c4413a7SShri Abhyankar     s1  = b[idx]; s2 = b[1+idx];
50830c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
50840c4413a7SShri Abhyankar       jdx   = 2*vi[m];
50850c4413a7SShri Abhyankar       x1    = t[jdx]; x2 = t[1+jdx];
50860c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
50870c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
50880c4413a7SShri Abhyankar       v += 4;
50890c4413a7SShri Abhyankar     }
50900c4413a7SShri Abhyankar     idx = 2*i;
50910c4413a7SShri Abhyankar     t[idx] = s1; t[1+idx] = s2;
50920c4413a7SShri Abhyankar   }
50930c4413a7SShri Abhyankar   /* backward solve the upper triangular */
50940c4413a7SShri Abhyankar   for (i=n-1; i>=0; i--){
50950c4413a7SShri Abhyankar     v    = aa + 4*(adiag[i+1]+1);
50960c4413a7SShri Abhyankar     vi   = aj + adiag[i+1]+1;
50970c4413a7SShri Abhyankar     nz   = adiag[i] - adiag[i+1] - 1;
50980c4413a7SShri Abhyankar     idt  = 2*i;
50990c4413a7SShri Abhyankar     s1 = t[idt]; s2 = t[1+idt];
51000c4413a7SShri Abhyankar     for(m=0;m<nz;m++){
51010c4413a7SShri Abhyankar       idx   = 2*vi[m];
51020c4413a7SShri Abhyankar       x1    = t[idx]; x2 = t[1+idx];
51030c4413a7SShri Abhyankar       s1 -= v[0]*x1 + v[2]*x2;
51040c4413a7SShri Abhyankar       s2 -= v[1]*x1 + v[3]*x2;
51050c4413a7SShri Abhyankar       v += 4;
51060c4413a7SShri Abhyankar     }
51070c4413a7SShri Abhyankar     idc = 2*c[i];
51080c4413a7SShri Abhyankar     x[idc]   = t[idt]   = v[0]*s1 + v[2]*s2;
51090c4413a7SShri Abhyankar     x[1+idc] = t[1+idt] = v[1]*s1 + v[3]*s2;
51100c4413a7SShri Abhyankar   }
51110c4413a7SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
51120c4413a7SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
51130c4413a7SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
51140c4413a7SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
51150c4413a7SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
51160c4413a7SShri Abhyankar   PetscFunctionReturn(0);
51170c4413a7SShri Abhyankar }
51188f690400SShri Abhyankar 
511915091d37SBarry Smith /*
512015091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
512115091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
512215091d37SBarry Smith */
51234a2ae208SSatish Balay #undef __FUNCT__
512406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering_inplace"
512506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
512615091d37SBarry Smith {
512715091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5128b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5129dfbe8321SBarry Smith   PetscErrorCode    ierr;
5130d9fead3dSBarry Smith   const MatScalar   *aa=a->a,*v;
5131d9fead3dSBarry Smith   PetscScalar       *x,s1,s2,x1,x2;
5132d9fead3dSBarry Smith   const PetscScalar *b;
5133b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
513415091d37SBarry Smith 
513515091d37SBarry Smith   PetscFunctionBegin;
5136d9fead3dSBarry Smith   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
51371ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
513815091d37SBarry Smith 
513915091d37SBarry Smith   /* forward solve the lower triangular */
514015091d37SBarry Smith   idx    = 0;
514115091d37SBarry Smith   x[0]   = b[0]; x[1] = b[1];
514215091d37SBarry Smith   for (i=1; i<n; i++) {
514315091d37SBarry Smith     v     =  aa      + 4*ai[i];
514415091d37SBarry Smith     vi    =  aj      + ai[i];
514515091d37SBarry Smith     nz    =  diag[i] - ai[i];
514615091d37SBarry Smith     idx   +=  2;
5147f1af5d2fSBarry Smith     s1  =  b[idx];s2 = b[1+idx];
514815091d37SBarry Smith     while (nz--) {
514915091d37SBarry Smith       jdx   = 2*(*vi++);
515015091d37SBarry Smith       x1    = x[jdx];x2 = x[1+jdx];
5151f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5152f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
515315091d37SBarry Smith       v    += 4;
515415091d37SBarry Smith     }
5155f1af5d2fSBarry Smith     x[idx]   = s1;
5156f1af5d2fSBarry Smith     x[1+idx] = s2;
515715091d37SBarry Smith   }
515815091d37SBarry Smith   /* backward solve the upper triangular */
515915091d37SBarry Smith   for (i=n-1; i>=0; i--){
516015091d37SBarry Smith     v    = aa + 4*diag[i] + 4;
516115091d37SBarry Smith     vi   = aj + diag[i] + 1;
516215091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
516315091d37SBarry Smith     idt  = 2*i;
5164f1af5d2fSBarry Smith     s1 = x[idt];  s2 = x[1+idt];
516515091d37SBarry Smith     while (nz--) {
516615091d37SBarry Smith       idx   = 2*(*vi++);
516715091d37SBarry Smith       x1    = x[idx];   x2 = x[1+idx];
5168f1af5d2fSBarry Smith       s1 -= v[0]*x1 + v[2]*x2;
5169f1af5d2fSBarry Smith       s2 -= v[1]*x1 + v[3]*x2;
517015091d37SBarry Smith       v    += 4;
517115091d37SBarry Smith     }
517215091d37SBarry Smith     v        = aa +  4*diag[i];
5173f1af5d2fSBarry Smith     x[idt]   = v[0]*s1 + v[2]*s2;
5174f1af5d2fSBarry Smith     x[1+idt] = v[1]*s1 + v[3]*s2;
517515091d37SBarry Smith   }
517615091d37SBarry Smith 
5177d9fead3dSBarry Smith   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
51781ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5179dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
518015091d37SBarry Smith   PetscFunctionReturn(0);
518115091d37SBarry Smith }
518215091d37SBarry Smith 
5183cee9d6f2SShri Abhyankar #undef __FUNCT__
51844dd39f65SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_2_NaturalOrdering"
51854dd39f65SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_2_NaturalOrdering(Mat A,Vec bb,Vec xx)
5186b2b2dd24SShri Abhyankar {
5187b2b2dd24SShri Abhyankar     Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5188b3260449SShri Abhyankar     const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag=a->diag;
5189b3260449SShri Abhyankar     PetscInt          i,k,nz,idx,idt,jdx;
5190b2b2dd24SShri Abhyankar     PetscErrorCode    ierr;
5191b2b2dd24SShri Abhyankar     const MatScalar   *aa=a->a,*v;
5192b2b2dd24SShri Abhyankar     PetscScalar       *x,s1,s2,x1,x2;
5193b2b2dd24SShri Abhyankar     const PetscScalar *b;
5194b2b2dd24SShri Abhyankar 
5195b2b2dd24SShri Abhyankar     PetscFunctionBegin;
5196b2b2dd24SShri Abhyankar     ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5197b2b2dd24SShri Abhyankar     ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5198b2b2dd24SShri Abhyankar     /* forward solve the lower triangular */
5199b2b2dd24SShri Abhyankar     idx    = 0;
5200b2b2dd24SShri Abhyankar     x[0] = b[idx]; x[1] = b[1+idx];
5201b2b2dd24SShri Abhyankar     for (i=1; i<n; i++) {
5202b2b2dd24SShri Abhyankar         v   = aa + 4*ai[i];
5203b2b2dd24SShri Abhyankar        vi   = aj + ai[i];
5204b2b2dd24SShri Abhyankar        nz   = ai[i+1] - ai[i];
5205b2b2dd24SShri Abhyankar        idx  = 2*i;
5206b2b2dd24SShri Abhyankar        s1   = b[idx];s2 = b[1+idx];
5207b2b2dd24SShri Abhyankar       for(k=0;k<nz;k++){
5208b2b2dd24SShri Abhyankar          jdx   = 2*vi[k];
5209b2b2dd24SShri Abhyankar           x1    = x[jdx];x2 = x[1+jdx];
5210b2b2dd24SShri Abhyankar           s1   -= v[0]*x1 + v[2]*x2;
5211b2b2dd24SShri Abhyankar           s2   -= v[1]*x1 + v[3]*x2;
5212b2b2dd24SShri Abhyankar            v   +=  4;
5213b2b2dd24SShri Abhyankar         }
5214b2b2dd24SShri Abhyankar        x[idx]   = s1;
5215b2b2dd24SShri Abhyankar        x[1+idx] = s2;
5216b2b2dd24SShri Abhyankar     }
5217b2b2dd24SShri Abhyankar 
5218b2b2dd24SShri Abhyankar    /* backward solve the upper triangular */
5219b2b2dd24SShri Abhyankar   for (i=n-1; i>=0; i--){
5220b2b2dd24SShri Abhyankar      v   = aa + 4*(adiag[i+1]+1);
5221b2b2dd24SShri Abhyankar      vi  = aj + adiag[i+1]+1;
5222b2b2dd24SShri Abhyankar      nz  = adiag[i] - adiag[i+1]-1;
5223b2b2dd24SShri Abhyankar      idt = 2*i;
5224b2b2dd24SShri Abhyankar      s1 = x[idt];  s2 = x[1+idt];
5225b2b2dd24SShri Abhyankar      for(k=0;k<nz;k++){
5226b2b2dd24SShri Abhyankar       idx   = 2*vi[k];
5227b2b2dd24SShri Abhyankar        x1    = x[idx];   x2 = x[1+idx];
5228b2b2dd24SShri Abhyankar        s1 -= v[0]*x1 + v[2]*x2;
5229b2b2dd24SShri Abhyankar        s2 -= v[1]*x1 + v[3]*x2;
5230b2b2dd24SShri Abhyankar          v    += 4;
5231b2b2dd24SShri Abhyankar     }
5232b2b2dd24SShri Abhyankar     /* x = inv_diagonal*x */
5233b2b2dd24SShri Abhyankar    x[idt]   = v[0]*s1 + v[2]*s2;
5234b2b2dd24SShri Abhyankar    x[1+idt] = v[1]*s1 + v[3]*s2;
5235b2b2dd24SShri Abhyankar   }
5236b2b2dd24SShri Abhyankar 
5237b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5238b2b2dd24SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5239b2b2dd24SShri Abhyankar   ierr = PetscLogFlops(2.0*4*(a->nz) - 2.0*A->cmap->n);CHKERRQ(ierr);
5240b2b2dd24SShri Abhyankar   PetscFunctionReturn(0);
5241b2b2dd24SShri Abhyankar }
5242b2b2dd24SShri Abhyankar 
5243b2b2dd24SShri Abhyankar #undef __FUNCT__
524406e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_inplace"
524506e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_inplace(Mat A,Vec bb,Vec xx)
52464e2b4712SSatish Balay {
52474e2b4712SSatish Balay   Mat_SeqBAIJ       *a=(Mat_SeqBAIJ *)A->data;
52484e2b4712SSatish Balay   IS                iscol=a->col,isrow=a->row;
52496849ba73SBarry Smith   PetscErrorCode    ierr;
5250b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j;
5251b3260449SShri Abhyankar   PetscInt          i,nz;
52525d0c19d7SBarry Smith   const PetscInt    *r,*c,*diag = a->diag,*rout,*cout;
5253b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5254b3260449SShri Abhyankar   PetscScalar       *x,s1,*t;
5255b3260449SShri Abhyankar   const PetscScalar *b;
52564e2b4712SSatish Balay 
52574e2b4712SSatish Balay   PetscFunctionBegin;
52584e2b4712SSatish Balay   if (!n) PetscFunctionReturn(0);
52594e2b4712SSatish Balay 
5260b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
52611ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5262f1af5d2fSBarry Smith   t  = a->solve_work;
52634e2b4712SSatish Balay 
52644e2b4712SSatish Balay   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
52654e2b4712SSatish Balay   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout + (n-1);
52664e2b4712SSatish Balay 
52674e2b4712SSatish Balay   /* forward solve the lower triangular */
5268f1af5d2fSBarry Smith   t[0] = b[*r++];
52694e2b4712SSatish Balay   for (i=1; i<n; i++) {
52704e2b4712SSatish Balay     v     = aa + ai[i];
52714e2b4712SSatish Balay     vi    = aj + ai[i];
52724e2b4712SSatish Balay     nz    = diag[i] - ai[i];
5273f1af5d2fSBarry Smith     s1  = b[*r++];
52744e2b4712SSatish Balay     while (nz--) {
5275f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
52764e2b4712SSatish Balay     }
5277f1af5d2fSBarry Smith     t[i] = s1;
52784e2b4712SSatish Balay   }
52794e2b4712SSatish Balay   /* backward solve the upper triangular */
52804e2b4712SSatish Balay   for (i=n-1; i>=0; i--){
52814e2b4712SSatish Balay     v    = aa + diag[i] + 1;
52824e2b4712SSatish Balay     vi   = aj + diag[i] + 1;
52834e2b4712SSatish Balay     nz   = ai[i+1] - diag[i] - 1;
5284f1af5d2fSBarry Smith     s1 = t[i];
52854e2b4712SSatish Balay     while (nz--) {
5286f1af5d2fSBarry Smith       s1 -= (*v++)*t[*vi++];
52874e2b4712SSatish Balay     }
5288f1af5d2fSBarry Smith     x[*c--] = t[i] = aa[diag[i]]*s1;
52894e2b4712SSatish Balay   }
52904e2b4712SSatish Balay 
52914e2b4712SSatish Balay   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
52924e2b4712SSatish Balay   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5293b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
52941ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5295dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*1*(a->nz) - A->cmap->n);CHKERRQ(ierr);
52964e2b4712SSatish Balay   PetscFunctionReturn(0);
52974e2b4712SSatish Balay }
5298048b5e81SShri Abhyankar 
5299048b5e81SShri Abhyankar #undef __FUNCT__
5300048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1"
5301048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1(Mat A,Vec bb,Vec xx)
5302048b5e81SShri Abhyankar {
5303048b5e81SShri Abhyankar   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data;
5304048b5e81SShri Abhyankar   IS                iscol = a->col,isrow = a->row;
5305048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5306048b5e81SShri Abhyankar   PetscInt          i,n=a->mbs,*vi,*ai=a->i,*aj=a->j,*adiag = a->diag,nz;
5307048b5e81SShri Abhyankar   const PetscInt    *rout,*cout,*r,*c;
5308048b5e81SShri Abhyankar   PetscScalar       *x,*tmp,sum;
5309048b5e81SShri Abhyankar   const PetscScalar *b;
5310048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5311048b5e81SShri Abhyankar 
5312048b5e81SShri Abhyankar   PetscFunctionBegin;
5313048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5314048b5e81SShri Abhyankar 
5315048b5e81SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5316048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5317048b5e81SShri Abhyankar   tmp  = a->solve_work;
5318048b5e81SShri Abhyankar 
5319048b5e81SShri Abhyankar   ierr = ISGetIndices(isrow,&rout);CHKERRQ(ierr); r = rout;
5320048b5e81SShri Abhyankar   ierr = ISGetIndices(iscol,&cout);CHKERRQ(ierr); c = cout;
5321048b5e81SShri Abhyankar 
5322048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5323048b5e81SShri Abhyankar   tmp[0] = b[r[0]];
5324048b5e81SShri Abhyankar   v      = aa;
5325048b5e81SShri Abhyankar   vi     = aj;
5326048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5327048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5328048b5e81SShri Abhyankar     sum = b[r[i]];
5329048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5330048b5e81SShri Abhyankar     tmp[i] = sum;
5331048b5e81SShri Abhyankar     v += nz; vi += nz;
5332048b5e81SShri Abhyankar   }
5333048b5e81SShri Abhyankar 
5334048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5335048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5336048b5e81SShri Abhyankar     v   = aa + adiag[i+1]+1;
5337048b5e81SShri Abhyankar     vi  = aj + adiag[i+1]+1;
5338048b5e81SShri Abhyankar     nz  = adiag[i]-adiag[i+1]-1;
5339048b5e81SShri Abhyankar     sum = tmp[i];
5340048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,tmp,v,vi,nz);
5341048b5e81SShri Abhyankar     x[c[i]] = tmp[i] = sum*v[nz]; /* v[nz] = aa[adiag[i]] */
5342048b5e81SShri Abhyankar   }
5343048b5e81SShri Abhyankar 
5344048b5e81SShri Abhyankar   ierr = ISRestoreIndices(isrow,&rout);CHKERRQ(ierr);
5345048b5e81SShri Abhyankar   ierr = ISRestoreIndices(iscol,&cout);CHKERRQ(ierr);
5346048b5e81SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5347048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5348048b5e81SShri Abhyankar   ierr = PetscLogFlops(2*a->nz - A->cmap->n);CHKERRQ(ierr);
5349048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5350048b5e81SShri Abhyankar }
5351048b5e81SShri Abhyankar 
535215091d37SBarry Smith /*
535315091d37SBarry Smith       Special case where the matrix was ILU(0) factored in the natural
535415091d37SBarry Smith    ordering. This eliminates the need for the column and row permutation.
535515091d37SBarry Smith */
53564a2ae208SSatish Balay #undef __FUNCT__
535706e38f1dSHong Zhang #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering_inplace"
535806e38f1dSHong Zhang PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering_inplace(Mat A,Vec bb,Vec xx)
535915091d37SBarry Smith {
536015091d37SBarry Smith   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ *)A->data;
5361b3260449SShri Abhyankar   const PetscInt    n=a->mbs,*vi,*ai=a->i,*aj=a->j,*diag=a->diag;
5362dfbe8321SBarry Smith   PetscErrorCode    ierr;
5363b3260449SShri Abhyankar   const MatScalar   *aa=a->a,*v;
5364b3260449SShri Abhyankar   PetscScalar       *x;
5365b3260449SShri Abhyankar   const PetscScalar *b;
536687828ca2SBarry Smith   PetscScalar       s1,x1;
5367b3260449SShri Abhyankar   PetscInt          jdx,idt,idx,nz,i;
536815091d37SBarry Smith 
536915091d37SBarry Smith   PetscFunctionBegin;
5370b3260449SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
53711ebc52fbSHong Zhang   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
537215091d37SBarry Smith 
537315091d37SBarry Smith   /* forward solve the lower triangular */
537415091d37SBarry Smith   idx    = 0;
537515091d37SBarry Smith   x[0]   = b[0];
537615091d37SBarry Smith   for (i=1; i<n; i++) {
537715091d37SBarry Smith     v     =  aa      + ai[i];
537815091d37SBarry Smith     vi    =  aj      + ai[i];
537915091d37SBarry Smith     nz    =  diag[i] - ai[i];
538015091d37SBarry Smith     idx   +=  1;
5381f1af5d2fSBarry Smith     s1  =  b[idx];
538215091d37SBarry Smith     while (nz--) {
538315091d37SBarry Smith       jdx   = *vi++;
538415091d37SBarry Smith       x1    = x[jdx];
5385f1af5d2fSBarry Smith       s1 -= v[0]*x1;
538615091d37SBarry Smith       v    += 1;
538715091d37SBarry Smith     }
5388f1af5d2fSBarry Smith     x[idx]   = s1;
538915091d37SBarry Smith   }
539015091d37SBarry Smith   /* backward solve the upper triangular */
539115091d37SBarry Smith   for (i=n-1; i>=0; i--){
539215091d37SBarry Smith     v    = aa + diag[i] + 1;
539315091d37SBarry Smith     vi   = aj + diag[i] + 1;
539415091d37SBarry Smith     nz   = ai[i+1] - diag[i] - 1;
539515091d37SBarry Smith     idt  = i;
5396f1af5d2fSBarry Smith     s1 = x[idt];
539715091d37SBarry Smith     while (nz--) {
539815091d37SBarry Smith       idx   = *vi++;
539915091d37SBarry Smith       x1    = x[idx];
5400f1af5d2fSBarry Smith       s1 -= v[0]*x1;
540115091d37SBarry Smith       v    += 1;
540215091d37SBarry Smith     }
540315091d37SBarry Smith     v        = aa +  diag[i];
5404f1af5d2fSBarry Smith     x[idt]   = v[0]*s1;
540515091d37SBarry Smith   }
5406b3260449SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
54071ebc52fbSHong Zhang   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5408dc0b31edSSatish Balay   ierr = PetscLogFlops(2.0*(a->nz) - A->cmap->n);CHKERRQ(ierr);
540915091d37SBarry Smith   PetscFunctionReturn(0);
541015091d37SBarry Smith }
54114e2b4712SSatish Balay 
5412048b5e81SShri Abhyankar 
5413048b5e81SShri Abhyankar #undef __FUNCT__
5414048b5e81SShri Abhyankar #define __FUNCT__ "MatSolve_SeqBAIJ_1_NaturalOrdering"
5415048b5e81SShri Abhyankar PetscErrorCode MatSolve_SeqBAIJ_1_NaturalOrdering(Mat A,Vec bb,Vec xx)
5416048b5e81SShri Abhyankar {
5417048b5e81SShri Abhyankar   Mat_SeqBAIJ       *a = (Mat_SeqBAIJ*)A->data;
5418048b5e81SShri Abhyankar   PetscErrorCode    ierr;
5419048b5e81SShri Abhyankar   const PetscInt    n = a->mbs,*ai = a->i,*aj = a->j,*adiag = a->diag,*vi;
5420048b5e81SShri Abhyankar   PetscScalar       *x,sum;
5421048b5e81SShri Abhyankar   const PetscScalar *b;
5422048b5e81SShri Abhyankar   const MatScalar   *aa = a->a,*v;
5423048b5e81SShri Abhyankar   PetscInt          i,nz;
5424048b5e81SShri Abhyankar 
5425048b5e81SShri Abhyankar   PetscFunctionBegin;
5426048b5e81SShri Abhyankar   if (!n) PetscFunctionReturn(0);
5427048b5e81SShri Abhyankar 
5428048b5e81SShri Abhyankar   ierr = VecGetArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5429048b5e81SShri Abhyankar   ierr = VecGetArray(xx,&x);CHKERRQ(ierr);
5430048b5e81SShri Abhyankar 
5431048b5e81SShri Abhyankar   /* forward solve the lower triangular */
5432048b5e81SShri Abhyankar   x[0] = b[0];
5433048b5e81SShri Abhyankar   v    = aa;
5434048b5e81SShri Abhyankar   vi   = aj;
5435048b5e81SShri Abhyankar   for (i=1; i<n; i++) {
5436048b5e81SShri Abhyankar     nz  = ai[i+1] - ai[i];
5437048b5e81SShri Abhyankar     sum = b[i];
5438048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5439048b5e81SShri Abhyankar     v  += nz;
5440048b5e81SShri Abhyankar     vi += nz;
5441048b5e81SShri Abhyankar     x[i] = sum;
5442048b5e81SShri Abhyankar   }
5443048b5e81SShri Abhyankar 
5444048b5e81SShri Abhyankar   /* backward solve the upper triangular */
5445048b5e81SShri Abhyankar   for (i=n-1; i>=0; i--){
5446048b5e81SShri Abhyankar     v   = aa + adiag[i+1] + 1;
5447048b5e81SShri Abhyankar     vi  = aj + adiag[i+1] + 1;
5448048b5e81SShri Abhyankar     nz = adiag[i] - adiag[i+1]-1;
5449048b5e81SShri Abhyankar     sum = x[i];
5450048b5e81SShri Abhyankar     PetscSparseDenseMinusDot(sum,x,v,vi,nz);
5451048b5e81SShri Abhyankar     x[i] = sum*v[nz]; /* x[i]=aa[adiag[i]]*sum; v++; */
5452048b5e81SShri Abhyankar   }
5453048b5e81SShri Abhyankar 
5454048b5e81SShri Abhyankar   ierr = PetscLogFlops(2.0*a->nz - A->cmap->n);CHKERRQ(ierr);
5455048b5e81SShri Abhyankar   ierr = VecRestoreArray(bb,(PetscScalar**)&b);CHKERRQ(ierr);
5456048b5e81SShri Abhyankar   ierr = VecRestoreArray(xx,&x);CHKERRQ(ierr);
5457048b5e81SShri Abhyankar   PetscFunctionReturn(0);
5458048b5e81SShri Abhyankar }
5459048b5e81SShri Abhyankar 
54604e2b4712SSatish Balay /* ----------------------------------------------------------------*/
546116a2bf60SHong Zhang EXTERN PetscErrorCode MatDuplicateNoCreate_SeqBAIJ(Mat,Mat,MatDuplicateOption,PetscTruth);
54626bce7ff8SHong Zhang 
54632b0b2ea7SShri Abhyankar #undef __FUNCT__
546429a97285SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering"
5465766f9fbaSBarry Smith /*
5466766f9fbaSBarry Smith    This is not much faster than MatLUFactorNumeric_SeqBAIJ_N() but the solve is faster at least sometimes
5467766f9fbaSBarry Smith */
546829a97285SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_15_NaturalOrdering(Mat B,Mat A,const MatFactorInfo *info)
54692b0b2ea7SShri Abhyankar {
54702b0b2ea7SShri Abhyankar   Mat             C=B;
54712b0b2ea7SShri Abhyankar   Mat_SeqBAIJ     *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
54722b0b2ea7SShri Abhyankar   PetscErrorCode  ierr;
5473766f9fbaSBarry Smith   PetscInt        i,j,k,ipvt[15];
5474766f9fbaSBarry Smith   const PetscInt  n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j,*ajtmp,*bjtmp,*bdiag=b->diag,*pj;
5475766f9fbaSBarry Smith   PetscInt        nz,nzL,row;
5476766f9fbaSBarry Smith   MatScalar       *rtmp,*pc,*mwork,*pv,*vv,work[225];
5477766f9fbaSBarry Smith   const MatScalar *v,*aa=a->a;
54782b0b2ea7SShri Abhyankar   PetscInt        bs2 = a->bs2,bs=A->rmap->bs,flg;
54790fa040f9SShri Abhyankar   PetscInt        sol_ver;
54802b0b2ea7SShri Abhyankar 
54812b0b2ea7SShri Abhyankar   PetscFunctionBegin;
54822b0b2ea7SShri Abhyankar 
54830fa040f9SShri Abhyankar   ierr = PetscOptionsGetInt(PETSC_NULL,"-sol_ver",&sol_ver,PETSC_NULL);CHKERRQ(ierr);
54840fa040f9SShri Abhyankar 
54852b0b2ea7SShri Abhyankar   /* generate work space needed by the factorization */
54862b0b2ea7SShri Abhyankar   ierr = PetscMalloc2(bs2*n,MatScalar,&rtmp,bs2,MatScalar,&mwork);CHKERRQ(ierr);
54872b0b2ea7SShri Abhyankar   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
54882b0b2ea7SShri Abhyankar 
54892b0b2ea7SShri Abhyankar   for (i=0; i<n; i++){
54902b0b2ea7SShri Abhyankar     /* zero rtmp */
54912b0b2ea7SShri Abhyankar     /* L part */
54922b0b2ea7SShri Abhyankar     nz    = bi[i+1] - bi[i];
54932b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
54942b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
54952b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
54962b0b2ea7SShri Abhyankar     }
54972b0b2ea7SShri Abhyankar 
54982b0b2ea7SShri Abhyankar     /* U part */
54992b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
55002b0b2ea7SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
55012b0b2ea7SShri Abhyankar     for  (j=0; j<nz; j++){
55022b0b2ea7SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55032b0b2ea7SShri Abhyankar     }
55042b0b2ea7SShri Abhyankar 
55052b0b2ea7SShri Abhyankar     /* load in initial (unfactored row) */
550629a97285SShri Abhyankar     nz    = ai[i+1] - ai[i];
550729a97285SShri Abhyankar     ajtmp = aj + ai[i];
550829a97285SShri Abhyankar     v     = aa + bs2*ai[i];
55092b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
551029a97285SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ajtmp[j],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
55112b0b2ea7SShri Abhyankar     }
55122b0b2ea7SShri Abhyankar 
55132b0b2ea7SShri Abhyankar     /* elimination */
55142b0b2ea7SShri Abhyankar     bjtmp = bj + bi[i];
55152b0b2ea7SShri Abhyankar     nzL   = bi[i+1] - bi[i];
55162b0b2ea7SShri Abhyankar     for(k=0;k < nzL;k++) {
55172b0b2ea7SShri Abhyankar       row = bjtmp[k];
55182b0b2ea7SShri Abhyankar       pc = rtmp + bs2*row;
55192b0b2ea7SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
55202b0b2ea7SShri Abhyankar       if (flg) {
55212b0b2ea7SShri Abhyankar         pv = b->a + bs2*bdiag[row];
5522766f9fbaSBarry Smith 	Kernel_A_gets_A_times_B(bs,pc,pv,mwork);
5523766f9fbaSBarry Smith 	/*ierr = Kernel_A_gets_A_times_B_15(pc,pv,mwork);CHKERRQ(ierr);*/
55242b0b2ea7SShri Abhyankar 	pj = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
55252b0b2ea7SShri Abhyankar         pv = b->a + bs2*(bdiag[row+1]+1);
55262b0b2ea7SShri Abhyankar         nz = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
55272b0b2ea7SShri Abhyankar         for (j=0; j<nz; j++) {
5528766f9fbaSBarry Smith           vv   = rtmp + bs2*pj[j];
5529766f9fbaSBarry Smith           Kernel_A_gets_A_minus_B_times_C(bs,vv,pc,pv);
5530766f9fbaSBarry Smith 	  /* ierr = Kernel_A_gets_A_minus_B_times_C_15(vv,pc,pv);CHKERRQ(ierr); */
55312b0b2ea7SShri Abhyankar 	  pv  += bs2;
55322b0b2ea7SShri Abhyankar         }
5533766f9fbaSBarry Smith         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
55342b0b2ea7SShri Abhyankar       }
55352b0b2ea7SShri Abhyankar     }
55362b0b2ea7SShri Abhyankar 
55372b0b2ea7SShri Abhyankar     /* finished row so stick it into b->a */
55382b0b2ea7SShri Abhyankar     /* L part */
55392b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
55402b0b2ea7SShri Abhyankar     pj   = b->j + bi[i] ;
55412b0b2ea7SShri Abhyankar     nz   = bi[i+1] - bi[i];
55422b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++) {
55432b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55442b0b2ea7SShri Abhyankar     }
55452b0b2ea7SShri Abhyankar 
55462b0b2ea7SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
55472b0b2ea7SShri Abhyankar     pv   = b->a + bs2*bdiag[i];
55482b0b2ea7SShri Abhyankar     pj   = b->j + bdiag[i];
55492b0b2ea7SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5550766f9fbaSBarry Smith     /* Kernel_A_gets_inverse_A(bs,pv,pivots,work); */
5551182b8fbaSHong Zhang     ierr = Kernel_A_gets_inverse_A_15(pv,ipvt,work,info->shiftamount);CHKERRQ(ierr);
55522b0b2ea7SShri Abhyankar 
55532b0b2ea7SShri Abhyankar     /* U part */
55542b0b2ea7SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
55552b0b2ea7SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
55562b0b2ea7SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
55572b0b2ea7SShri Abhyankar     for (j=0; j<nz; j++){
55582b0b2ea7SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
55592b0b2ea7SShri Abhyankar     }
55602b0b2ea7SShri Abhyankar   }
55612b0b2ea7SShri Abhyankar 
55622b0b2ea7SShri Abhyankar   ierr = PetscFree2(rtmp,mwork);CHKERRQ(ierr);
5563832cc040SShri Abhyankar   C->ops->solve = MatSolve_SeqBAIJ_15_NaturalOrdering_ver1;
5564766f9fbaSBarry Smith   C->ops->solvetranspose = MatSolve_SeqBAIJ_N_NaturalOrdering;
55652b0b2ea7SShri Abhyankar   C->assembled = PETSC_TRUE;
5566766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
55672b0b2ea7SShri Abhyankar   PetscFunctionReturn(0);
55682b0b2ea7SShri Abhyankar }
55692b0b2ea7SShri Abhyankar 
55706bce7ff8SHong Zhang #undef __FUNCT__
55714dd39f65SShri Abhyankar #define __FUNCT__ "MatLUFactorNumeric_SeqBAIJ_N"
55724dd39f65SShri Abhyankar PetscErrorCode MatLUFactorNumeric_SeqBAIJ_N(Mat B,Mat A,const MatFactorInfo *info)
55736bce7ff8SHong Zhang {
55746bce7ff8SHong Zhang   Mat            C=B;
55756bce7ff8SHong Zhang   Mat_SeqBAIJ    *a=(Mat_SeqBAIJ*)A->data,*b=(Mat_SeqBAIJ *)C->data;
55766bce7ff8SHong Zhang   IS             isrow = b->row,isicol = b->icol;
55776bce7ff8SHong Zhang   PetscErrorCode ierr;
55786bce7ff8SHong Zhang   const PetscInt *r,*ic,*ics;
55796bce7ff8SHong Zhang   PetscInt       i,j,k,n=a->mbs,*ai=a->i,*aj=a->j,*bi=b->i,*bj=b->j;
55806bce7ff8SHong Zhang   PetscInt       *ajtmp,*bjtmp,nz,nzL,row,*bdiag=b->diag,*pj;
5581b588c5a2SHong Zhang   MatScalar      *rtmp,*pc,*mwork,*v,*pv,*aa=a->a;
5582914a18a2SHong Zhang   PetscInt       bs=A->rmap->bs,bs2 = a->bs2,*v_pivots,flg;
5583914a18a2SHong Zhang   MatScalar      *v_work;
5584ae3d28f0SHong Zhang   PetscTruth     col_identity,row_identity,both_identity;
55856bce7ff8SHong Zhang 
55866bce7ff8SHong Zhang   PetscFunctionBegin;
55876bce7ff8SHong Zhang   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
55886bce7ff8SHong Zhang   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
5589ae3d28f0SHong Zhang 
5590fca92195SBarry Smith   ierr = PetscMalloc(bs2*n*sizeof(MatScalar),&rtmp);CHKERRQ(ierr);
5591fca92195SBarry Smith   ierr = PetscMemzero(rtmp,bs2*n*sizeof(MatScalar));CHKERRQ(ierr);
55926bce7ff8SHong Zhang   ics  = ic;
55936bce7ff8SHong Zhang 
5594914a18a2SHong Zhang   /* generate work space needed by dense LU factorization */
5595fca92195SBarry Smith   ierr  = PetscMalloc3(bs,MatScalar,&v_work,bs2,MatScalar,&mwork,bs,PetscInt,&v_pivots);CHKERRQ(ierr);
5596914a18a2SHong Zhang 
55976bce7ff8SHong Zhang   for (i=0; i<n; i++){
55986bce7ff8SHong Zhang     /* zero rtmp */
55996bce7ff8SHong Zhang     /* L part */
56006bce7ff8SHong Zhang     nz    = bi[i+1] - bi[i];
56016bce7ff8SHong Zhang     bjtmp = bj + bi[i];
5602914a18a2SHong Zhang     for  (j=0; j<nz; j++){
5603914a18a2SHong Zhang       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
5604914a18a2SHong Zhang     }
56056bce7ff8SHong Zhang 
56066bce7ff8SHong Zhang     /* U part */
56071a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1];
56081a83e813SShri Abhyankar     bjtmp = bj + bdiag[i+1]+1;
56091a83e813SShri Abhyankar     for  (j=0; j<nz; j++){
56101a83e813SShri Abhyankar       ierr = PetscMemzero(rtmp+bs2*bjtmp[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56111a83e813SShri Abhyankar     }
56121a83e813SShri Abhyankar 
56131a83e813SShri Abhyankar     /* load in initial (unfactored row) */
56141a83e813SShri Abhyankar     nz    = ai[r[i]+1] - ai[r[i]];
56151a83e813SShri Abhyankar     ajtmp = aj + ai[r[i]];
56161a83e813SShri Abhyankar     v     = aa + bs2*ai[r[i]];
56171a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
56181a83e813SShri Abhyankar       ierr = PetscMemcpy(rtmp+bs2*ic[ajtmp[j]],v+bs2*j,bs2*sizeof(MatScalar));CHKERRQ(ierr);
56191a83e813SShri Abhyankar     }
56201a83e813SShri Abhyankar 
56211a83e813SShri Abhyankar     /* elimination */
56221a83e813SShri Abhyankar     bjtmp = bj + bi[i];
56231a83e813SShri Abhyankar     nzL   = bi[i+1] - bi[i];
56241a83e813SShri Abhyankar     for(k=0;k < nzL;k++) {
56251a83e813SShri Abhyankar       row = bjtmp[k];
56261a83e813SShri Abhyankar       pc = rtmp + bs2*row;
56271a83e813SShri Abhyankar       for (flg=0,j=0; j<bs2; j++) { if (pc[j]!=0.0) { flg = 1; break; }}
56281a83e813SShri Abhyankar       if (flg) {
56291a83e813SShri Abhyankar         pv         = b->a + bs2*bdiag[row];
56301a83e813SShri Abhyankar         Kernel_A_gets_A_times_B(bs,pc,pv,mwork); /* *pc = *pc * (*pv); */
56311a83e813SShri Abhyankar         pj         = b->j + bdiag[row+1]+1; /* begining of U(row,:) */
56321a83e813SShri Abhyankar         pv         = b->a + bs2*(bdiag[row+1]+1);
56331a83e813SShri Abhyankar         nz         = bdiag[row] - bdiag[row+1] - 1; /* num of entries inU(row,:), excluding diag */
56341a83e813SShri Abhyankar         for (j=0; j<nz; j++) {
56351a83e813SShri Abhyankar           Kernel_A_gets_A_minus_B_times_C(bs,rtmp+bs2*pj[j],pc,pv+bs2*j);
56361a83e813SShri Abhyankar         }
56371a83e813SShri Abhyankar         ierr = PetscLogFlops(2*bs2*bs*(nz+1)-bs2);CHKERRQ(ierr); /* flops = 2*bs^3*nz + 2*bs^3 - bs2) */
56381a83e813SShri Abhyankar       }
56391a83e813SShri Abhyankar     }
56401a83e813SShri Abhyankar 
56411a83e813SShri Abhyankar     /* finished row so stick it into b->a */
56421a83e813SShri Abhyankar     /* L part */
56431a83e813SShri Abhyankar     pv   = b->a + bs2*bi[i] ;
56441a83e813SShri Abhyankar     pj   = b->j + bi[i] ;
56451a83e813SShri Abhyankar     nz   = bi[i+1] - bi[i];
56461a83e813SShri Abhyankar     for (j=0; j<nz; j++) {
56471a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56481a83e813SShri Abhyankar     }
56491a83e813SShri Abhyankar 
56501a83e813SShri Abhyankar     /* Mark diagonal and invert diagonal for simplier triangular solves */
56511a83e813SShri Abhyankar     pv  = b->a + bs2*bdiag[i];
56521a83e813SShri Abhyankar     pj  = b->j + bdiag[i];
5653e32f2f54SBarry Smith     /* if (*pj != i)SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_SUP,"row %d != *pj %d",i,*pj); */
56541a83e813SShri Abhyankar     ierr = PetscMemcpy(pv,rtmp+bs2*pj[0],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56551a83e813SShri Abhyankar     ierr = Kernel_A_gets_inverse_A(bs,pv,v_pivots,v_work);CHKERRQ(ierr);
56561a83e813SShri Abhyankar 
56571a83e813SShri Abhyankar     /* U part */
56581a83e813SShri Abhyankar     pv = b->a + bs2*(bdiag[i+1]+1);
56591a83e813SShri Abhyankar     pj = b->j + bdiag[i+1]+1;
56601a83e813SShri Abhyankar     nz = bdiag[i] - bdiag[i+1] - 1;
56611a83e813SShri Abhyankar     for (j=0; j<nz; j++){
56621a83e813SShri Abhyankar       ierr = PetscMemcpy(pv+bs2*j,rtmp+bs2*pj[j],bs2*sizeof(MatScalar));CHKERRQ(ierr);
56631a83e813SShri Abhyankar     }
56641a83e813SShri Abhyankar   }
56651a83e813SShri Abhyankar 
56661a83e813SShri Abhyankar   ierr = PetscFree(rtmp);CHKERRQ(ierr);
5667fca92195SBarry Smith   ierr = PetscFree3(v_work,mwork,v_pivots);CHKERRQ(ierr);
56681a83e813SShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
56691a83e813SShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
56701a83e813SShri Abhyankar 
5671ae3d28f0SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5672ae3d28f0SHong Zhang   ierr = ISIdentity(isicol,&col_identity);CHKERRQ(ierr);
5673ae3d28f0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
5674ae3d28f0SHong Zhang   if (both_identity){
56754dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N_NaturalOrdering;
5676ae3d28f0SHong Zhang   } else {
56774dd39f65SShri Abhyankar     C->ops->solve = MatSolve_SeqBAIJ_N;
5678ae3d28f0SHong Zhang   }
56794dd39f65SShri Abhyankar   C->ops->solvetranspose = MatSolveTranspose_SeqBAIJ_N;
5680ae3d28f0SHong Zhang 
56811a83e813SShri Abhyankar   C->assembled = PETSC_TRUE;
5682766f9fbaSBarry Smith   ierr = PetscLogFlops(1.333333333333*bs*bs2*b->mbs);CHKERRQ(ierr); /* from inverting diagonal blocks */
56831a83e813SShri Abhyankar   PetscFunctionReturn(0);
56841a83e813SShri Abhyankar }
56851a83e813SShri Abhyankar 
56866bce7ff8SHong Zhang /*
56876bce7ff8SHong Zhang    ilu(0) with natural ordering under new data structure.
56884dd39f65SShri Abhyankar    See MatILUFactorSymbolic_SeqAIJ_ilu0() for detailed description
56894dd39f65SShri Abhyankar    because this code is almost identical to MatILUFactorSymbolic_SeqAIJ_ilu0_inplace().
56906bce7ff8SHong Zhang */
5691c0c7eb62SShri Abhyankar 
56926bce7ff8SHong Zhang #undef __FUNCT__
56934dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_ilu0"
56944dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_ilu0(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
56956bce7ff8SHong Zhang {
56966bce7ff8SHong Zhang 
56976bce7ff8SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
56986bce7ff8SHong Zhang   PetscErrorCode     ierr;
569916a2bf60SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj,*adiag=a->diag,bs2 = a->bs2;
570035aa4fcfSShri Abhyankar   PetscInt           i,j,nz,*bi,*bj,*bdiag,bi_temp;
570135aa4fcfSShri Abhyankar 
570235aa4fcfSShri Abhyankar   PetscFunctionBegin;
570335aa4fcfSShri Abhyankar   ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_FALSE);CHKERRQ(ierr);
570435aa4fcfSShri Abhyankar   b    = (Mat_SeqBAIJ*)(fact)->data;
570535aa4fcfSShri Abhyankar 
570635aa4fcfSShri Abhyankar   /* allocate matrix arrays for new data structure */
570735aa4fcfSShri Abhyankar   ierr = PetscMalloc3(bs2*ai[n]+1,PetscScalar,&b->a,ai[n]+1,PetscInt,&b->j,n+1,PetscInt,&b->i);CHKERRQ(ierr);
570835aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,ai[n]*(bs2*sizeof(PetscScalar)+sizeof(PetscInt))+(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
570935aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_TRUE;
571035aa4fcfSShri Abhyankar   if (!b->diag){
571135aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&b->diag);CHKERRQ(ierr);
571235aa4fcfSShri Abhyankar     ierr = PetscLogObjectMemory(fact,(n+1)*sizeof(PetscInt));CHKERRQ(ierr);
571335aa4fcfSShri Abhyankar   }
571435aa4fcfSShri Abhyankar   bdiag = b->diag;
571535aa4fcfSShri Abhyankar 
571635aa4fcfSShri Abhyankar   if (n > 0) {
571735aa4fcfSShri Abhyankar     ierr = PetscMemzero(b->a,bs2*ai[n]*sizeof(MatScalar));CHKERRQ(ierr);
571835aa4fcfSShri Abhyankar   }
571935aa4fcfSShri Abhyankar 
572035aa4fcfSShri Abhyankar   /* set bi and bj with new data structure */
572135aa4fcfSShri Abhyankar   bi = b->i;
572235aa4fcfSShri Abhyankar   bj = b->j;
572335aa4fcfSShri Abhyankar 
572435aa4fcfSShri Abhyankar   /* L part */
572535aa4fcfSShri Abhyankar   bi[0] = 0;
572635aa4fcfSShri Abhyankar   for (i=0; i<n; i++){
572735aa4fcfSShri Abhyankar     nz = adiag[i] - ai[i];
572835aa4fcfSShri Abhyankar     bi[i+1] = bi[i] + nz;
572935aa4fcfSShri Abhyankar     aj = a->j + ai[i];
573035aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
573135aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
573235aa4fcfSShri Abhyankar     }
573335aa4fcfSShri Abhyankar   }
573435aa4fcfSShri Abhyankar 
573535aa4fcfSShri Abhyankar   /* U part */
573635aa4fcfSShri Abhyankar   bi_temp = bi[n];
573735aa4fcfSShri Abhyankar   bdiag[n] = bi[n]-1;
573835aa4fcfSShri Abhyankar   for (i=n-1; i>=0; i--){
573935aa4fcfSShri Abhyankar     nz = ai[i+1] - adiag[i] - 1;
574035aa4fcfSShri Abhyankar     bi_temp = bi_temp + nz + 1;
574135aa4fcfSShri Abhyankar     aj = a->j + adiag[i] + 1;
574235aa4fcfSShri Abhyankar     for (j=0; j<nz; j++){
574335aa4fcfSShri Abhyankar       *bj = aj[j]; bj++;
574435aa4fcfSShri Abhyankar     }
574535aa4fcfSShri Abhyankar     /* diag[i] */
574635aa4fcfSShri Abhyankar     *bj = i; bj++;
574735aa4fcfSShri Abhyankar     bdiag[i] = bi_temp - 1;
574835aa4fcfSShri Abhyankar   }
574935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
575035aa4fcfSShri Abhyankar }
575135aa4fcfSShri Abhyankar 
575235aa4fcfSShri Abhyankar #undef __FUNCT__
57534dd39f65SShri Abhyankar #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ"
57544dd39f65SShri Abhyankar PetscErrorCode MatILUFactorSymbolic_SeqBAIJ(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
575516a2bf60SHong Zhang {
575616a2bf60SHong Zhang   Mat_SeqBAIJ        *a = (Mat_SeqBAIJ*)A->data,*b;
575716a2bf60SHong Zhang   IS                 isicol;
575816a2bf60SHong Zhang   PetscErrorCode     ierr;
575916a2bf60SHong Zhang   const PetscInt     *r,*ic;
57607fa3a6a0SHong Zhang   PetscInt           n=a->mbs,*ai=a->i,*aj=a->j,d;
576116a2bf60SHong Zhang   PetscInt           *bi,*cols,nnz,*cols_lvl;
576216a2bf60SHong Zhang   PetscInt           *bdiag,prow,fm,nzbd,reallocs=0,dcount=0;
576316a2bf60SHong Zhang   PetscInt           i,levels,diagonal_fill;
57647fa3a6a0SHong Zhang   PetscTruth         col_identity,row_identity,both_identity;
576516a2bf60SHong Zhang   PetscReal          f;
576616a2bf60SHong Zhang   PetscInt           nlnk,*lnk,*lnk_lvl=PETSC_NULL;
576716a2bf60SHong Zhang   PetscBT            lnkbt;
576816a2bf60SHong Zhang   PetscInt           nzi,*bj,**bj_ptr,**bjlvl_ptr;
576916a2bf60SHong Zhang   PetscFreeSpaceList free_space=PETSC_NULL,current_space=PETSC_NULL;
577016a2bf60SHong Zhang   PetscFreeSpaceList free_space_lvl=PETSC_NULL,current_space_lvl=PETSC_NULL;
577116a2bf60SHong Zhang   PetscTruth         missing;
57727fa3a6a0SHong Zhang   PetscInt           bs=A->rmap->bs,bs2=a->bs2;
577316a2bf60SHong Zhang 
577416a2bf60SHong Zhang   PetscFunctionBegin;
5775e32f2f54SBarry Smith   if (A->rmap->n != A->cmap->n) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Must be square matrix, rows %D columns %D",A->rmap->n,A->cmap->n);
5776*6ba06ab7SHong Zhang   if (bs>1){  /* check shifttype */
5777*6ba06ab7SHong Zhang     if (info->shifttype == MAT_SHIFT_NONZERO || info->shifttype == MAT_SHIFT_POSITIVE_DEFINITE)
5778*6ba06ab7SHong Zhang       SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Only MAT_SHIFT_NONE and MAT_SHIFT_INBLOCKS are supported for BAIJ matrix");
5779*6ba06ab7SHong Zhang   }
5780*6ba06ab7SHong Zhang 
578116a2bf60SHong Zhang   ierr = MatMissingDiagonal(A,&missing,&d);CHKERRQ(ierr);
5782e32f2f54SBarry Smith   if (missing) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix is missing diagonal entry %D",d);
578316a2bf60SHong Zhang 
578416a2bf60SHong Zhang   f             = info->fill;
578516a2bf60SHong Zhang   levels        = (PetscInt)info->levels;
578616a2bf60SHong Zhang   diagonal_fill = (PetscInt)info->diagonal_fill;
578716a2bf60SHong Zhang   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
578816a2bf60SHong Zhang 
578916a2bf60SHong Zhang   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
579016a2bf60SHong Zhang   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
57917fa3a6a0SHong Zhang   both_identity = (PetscTruth) (row_identity && col_identity);
579216a2bf60SHong Zhang 
57937fa3a6a0SHong Zhang   if (!levels && both_identity) {
579416a2bf60SHong Zhang     /* special case: ilu(0) with natural ordering */
57954dd39f65SShri Abhyankar     ierr = MatILUFactorSymbolic_SeqBAIJ_ilu0(fact,A,isrow,iscol,info);CHKERRQ(ierr);
57964dd39f65SShri Abhyankar     ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
579735aa4fcfSShri Abhyankar 
5798d5f3da31SBarry Smith     fact->factortype               = MAT_FACTOR_ILU;
579935aa4fcfSShri Abhyankar     (fact)->info.factor_mallocs    = 0;
580035aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_given  = info->fill;
580135aa4fcfSShri Abhyankar     (fact)->info.fill_ratio_needed = 1.0;
580235aa4fcfSShri Abhyankar     b                = (Mat_SeqBAIJ*)(fact)->data;
580335aa4fcfSShri Abhyankar     b->row           = isrow;
580435aa4fcfSShri Abhyankar     b->col           = iscol;
580535aa4fcfSShri Abhyankar     b->icol          = isicol;
580635aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
580735aa4fcfSShri Abhyankar     ierr             = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
580835aa4fcfSShri Abhyankar     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
580935aa4fcfSShri Abhyankar     ierr = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
581035aa4fcfSShri Abhyankar     PetscFunctionReturn(0);
581135aa4fcfSShri Abhyankar   }
581235aa4fcfSShri Abhyankar 
581335aa4fcfSShri Abhyankar   ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
581435aa4fcfSShri Abhyankar   ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
581535aa4fcfSShri Abhyankar 
581635aa4fcfSShri Abhyankar   /* get new row pointers */
581735aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bi);CHKERRQ(ierr);
581835aa4fcfSShri Abhyankar   bi[0] = 0;
581935aa4fcfSShri Abhyankar   /* bdiag is location of diagonal in factor */
582035aa4fcfSShri Abhyankar   ierr = PetscMalloc((n+1)*sizeof(PetscInt),&bdiag);CHKERRQ(ierr);
582135aa4fcfSShri Abhyankar   bdiag[0]  = 0;
582235aa4fcfSShri Abhyankar 
5823fca92195SBarry Smith   ierr = PetscMalloc2(n,PetscInt*,&bj_ptr,n,PetscInt*,&bjlvl_ptr);CHKERRQ(ierr);
582435aa4fcfSShri Abhyankar 
582535aa4fcfSShri Abhyankar   /* create a linked list for storing column indices of the active row */
582635aa4fcfSShri Abhyankar   nlnk = n + 1;
582735aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLCreate(n,n,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
582835aa4fcfSShri Abhyankar 
582935aa4fcfSShri Abhyankar   /* initial FreeSpace size is f*(ai[n]+1) */
583035aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space);CHKERRQ(ierr);
583135aa4fcfSShri Abhyankar   current_space = free_space;
583235aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceGet((PetscInt)(f*(ai[n]+1)),&free_space_lvl);CHKERRQ(ierr);
583335aa4fcfSShri Abhyankar   current_space_lvl = free_space_lvl;
583435aa4fcfSShri Abhyankar 
583535aa4fcfSShri Abhyankar   for (i=0; i<n; i++) {
583635aa4fcfSShri Abhyankar     nzi = 0;
583735aa4fcfSShri Abhyankar     /* copy current row into linked list */
583835aa4fcfSShri Abhyankar     nnz  = ai[r[i]+1] - ai[r[i]];
5839e32f2f54SBarry Smith     if (!nnz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[i],i);
584035aa4fcfSShri Abhyankar     cols = aj + ai[r[i]];
584135aa4fcfSShri Abhyankar     lnk[i] = -1; /* marker to indicate if diagonal exists */
584235aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLInit(nnz,cols,n,ic,nlnk,lnk,lnk_lvl,lnkbt);CHKERRQ(ierr);
584335aa4fcfSShri Abhyankar     nzi += nlnk;
584435aa4fcfSShri Abhyankar 
584535aa4fcfSShri Abhyankar     /* make sure diagonal entry is included */
584635aa4fcfSShri Abhyankar     if (diagonal_fill && lnk[i] == -1) {
584735aa4fcfSShri Abhyankar       fm = n;
584835aa4fcfSShri Abhyankar       while (lnk[fm] < i) fm = lnk[fm];
584935aa4fcfSShri Abhyankar       lnk[i]     = lnk[fm]; /* insert diagonal into linked list */
585035aa4fcfSShri Abhyankar       lnk[fm]    = i;
585135aa4fcfSShri Abhyankar       lnk_lvl[i] = 0;
585235aa4fcfSShri Abhyankar       nzi++; dcount++;
585335aa4fcfSShri Abhyankar     }
585435aa4fcfSShri Abhyankar 
585535aa4fcfSShri Abhyankar     /* add pivot rows into the active row */
585635aa4fcfSShri Abhyankar     nzbd = 0;
585735aa4fcfSShri Abhyankar     prow = lnk[n];
585835aa4fcfSShri Abhyankar     while (prow < i) {
585935aa4fcfSShri Abhyankar       nnz      = bdiag[prow];
586035aa4fcfSShri Abhyankar       cols     = bj_ptr[prow] + nnz + 1;
586135aa4fcfSShri Abhyankar       cols_lvl = bjlvl_ptr[prow] + nnz + 1;
586235aa4fcfSShri Abhyankar       nnz      = bi[prow+1] - bi[prow] - nnz - 1;
586335aa4fcfSShri Abhyankar       ierr = PetscILULLAddSorted(nnz,cols,levels,cols_lvl,prow,nlnk,lnk,lnk_lvl,lnkbt,prow);CHKERRQ(ierr);
586435aa4fcfSShri Abhyankar       nzi += nlnk;
586535aa4fcfSShri Abhyankar       prow = lnk[prow];
586635aa4fcfSShri Abhyankar       nzbd++;
586735aa4fcfSShri Abhyankar     }
586835aa4fcfSShri Abhyankar     bdiag[i] = nzbd;
586935aa4fcfSShri Abhyankar     bi[i+1]  = bi[i] + nzi;
587035aa4fcfSShri Abhyankar 
587135aa4fcfSShri Abhyankar     /* if free space is not available, make more free space */
587235aa4fcfSShri Abhyankar     if (current_space->local_remaining<nzi) {
587335aa4fcfSShri Abhyankar       nnz = 2*nzi*(n - i); /* estimated and max additional space needed */
587435aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space);CHKERRQ(ierr);
587535aa4fcfSShri Abhyankar       ierr = PetscFreeSpaceGet(nnz,&current_space_lvl);CHKERRQ(ierr);
587635aa4fcfSShri Abhyankar       reallocs++;
587735aa4fcfSShri Abhyankar     }
587835aa4fcfSShri Abhyankar 
587935aa4fcfSShri Abhyankar     /* copy data into free_space and free_space_lvl, then initialize lnk */
588035aa4fcfSShri Abhyankar     ierr = PetscIncompleteLLClean(n,n,nzi,lnk,lnk_lvl,current_space->array,current_space_lvl->array,lnkbt);CHKERRQ(ierr);
588135aa4fcfSShri Abhyankar     bj_ptr[i]    = current_space->array;
588235aa4fcfSShri Abhyankar     bjlvl_ptr[i] = current_space_lvl->array;
588335aa4fcfSShri Abhyankar 
588435aa4fcfSShri Abhyankar     /* make sure the active row i has diagonal entry */
588565e19b50SBarry Smith     if (*(bj_ptr[i]+bdiag[i]) != i) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\ntry running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",i);
588635aa4fcfSShri Abhyankar 
588735aa4fcfSShri Abhyankar     current_space->array           += nzi;
588835aa4fcfSShri Abhyankar     current_space->local_used      += nzi;
588935aa4fcfSShri Abhyankar     current_space->local_remaining -= nzi;
589035aa4fcfSShri Abhyankar     current_space_lvl->array           += nzi;
589135aa4fcfSShri Abhyankar     current_space_lvl->local_used      += nzi;
589235aa4fcfSShri Abhyankar     current_space_lvl->local_remaining -= nzi;
589335aa4fcfSShri Abhyankar   }
589435aa4fcfSShri Abhyankar 
589535aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
589635aa4fcfSShri Abhyankar   ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
589735aa4fcfSShri Abhyankar 
589835aa4fcfSShri Abhyankar   /* destroy list of free space and other temporary arrays */
589935aa4fcfSShri Abhyankar   ierr = PetscMalloc((bi[n]+1)*sizeof(PetscInt),&bj);CHKERRQ(ierr);
590035aa4fcfSShri Abhyankar 
590135aa4fcfSShri Abhyankar   /* copy free_space into bj and free free_space; set bi, bj, bdiag in new datastructure; */
59022ce24eb6SHong Zhang   ierr = PetscFreeSpaceContiguous_LU(&free_space,bj,n,bi,bdiag);CHKERRQ(ierr);
590335aa4fcfSShri Abhyankar 
590435aa4fcfSShri Abhyankar   ierr = PetscIncompleteLLDestroy(lnk,lnkbt);CHKERRQ(ierr);
590535aa4fcfSShri Abhyankar   ierr = PetscFreeSpaceDestroy(free_space_lvl);CHKERRQ(ierr);
5906fca92195SBarry Smith   ierr = PetscFree2(bj_ptr,bjlvl_ptr);CHKERRQ(ierr);
590735aa4fcfSShri Abhyankar 
590835aa4fcfSShri Abhyankar #if defined(PETSC_USE_INFO)
590935aa4fcfSShri Abhyankar   {
591035aa4fcfSShri Abhyankar     PetscReal af = ((PetscReal)bi[n])/((PetscReal)ai[n]);
591135aa4fcfSShri Abhyankar     ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocs,f,af);CHKERRQ(ierr);
591235aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"Run with -[sub_]pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
591335aa4fcfSShri Abhyankar     ierr = PetscInfo1(A,"PCFactorSetFill([sub]pc,%G);\n",af);CHKERRQ(ierr);
591435aa4fcfSShri Abhyankar     ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
591535aa4fcfSShri Abhyankar     if (diagonal_fill) {
591635aa4fcfSShri Abhyankar       ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals",dcount);CHKERRQ(ierr);
591735aa4fcfSShri Abhyankar     }
591835aa4fcfSShri Abhyankar   }
591935aa4fcfSShri Abhyankar #endif
592035aa4fcfSShri Abhyankar 
592135aa4fcfSShri Abhyankar   /* put together the new matrix */
592235aa4fcfSShri Abhyankar   ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
592335aa4fcfSShri Abhyankar   ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
592435aa4fcfSShri Abhyankar   b = (Mat_SeqBAIJ*)(fact)->data;
592535aa4fcfSShri Abhyankar   b->free_a       = PETSC_TRUE;
592635aa4fcfSShri Abhyankar   b->free_ij      = PETSC_TRUE;
592735aa4fcfSShri Abhyankar   b->singlemalloc = PETSC_FALSE;
592835aa4fcfSShri Abhyankar   ierr = PetscMalloc( (bs2*(bdiag[0]+1) )*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
592935aa4fcfSShri Abhyankar   b->j          = bj;
593035aa4fcfSShri Abhyankar   b->i          = bi;
593135aa4fcfSShri Abhyankar   b->diag       = bdiag;
593235aa4fcfSShri Abhyankar   b->free_diag  = PETSC_TRUE;
593335aa4fcfSShri Abhyankar   b->ilen       = 0;
593435aa4fcfSShri Abhyankar   b->imax       = 0;
593535aa4fcfSShri Abhyankar   b->row        = isrow;
593635aa4fcfSShri Abhyankar   b->col        = iscol;
593735aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
593835aa4fcfSShri Abhyankar   ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
593935aa4fcfSShri Abhyankar   b->icol       = isicol;
594035aa4fcfSShri Abhyankar   ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
594135aa4fcfSShri Abhyankar   /* In b structure:  Free imax, ilen, old a, old j.
594235aa4fcfSShri Abhyankar      Allocate bdiag, solve_work, new a, new j */
594335aa4fcfSShri Abhyankar   ierr = PetscLogObjectMemory(fact,(bdiag[0]+1) * (sizeof(PetscInt)+bs2*sizeof(PetscScalar)));CHKERRQ(ierr);
594435aa4fcfSShri Abhyankar   b->maxnz = b->nz = bdiag[0]+1;
5945ae3d28f0SHong Zhang   fact->info.factor_mallocs    = reallocs;
5946ae3d28f0SHong Zhang   fact->info.fill_ratio_given  = f;
5947ae3d28f0SHong Zhang   fact->info.fill_ratio_needed = ((PetscReal)(bdiag[0]+1))/((PetscReal)ai[n]);
59484dd39f65SShri Abhyankar   ierr = MatSeqBAIJSetNumericFactorization(fact,both_identity);CHKERRQ(ierr);
594935aa4fcfSShri Abhyankar   PetscFunctionReturn(0);
595035aa4fcfSShri Abhyankar }
595135aa4fcfSShri Abhyankar 
595235aa4fcfSShri Abhyankar 
59534e2b4712SSatish Balay /*
59544e2b4712SSatish Balay      This code is virtually identical to MatILUFactorSymbolic_SeqAIJ
59554e2b4712SSatish Balay    except that the data structure of Mat_SeqAIJ is slightly different.
59564e2b4712SSatish Balay    Not a good example of code reuse.
59574e2b4712SSatish Balay */
59584a2ae208SSatish Balay #undef __FUNCT__
595906e38f1dSHong Zhang #define __FUNCT__ "MatILUFactorSymbolic_SeqBAIJ_inplace"
596006e38f1dSHong Zhang PetscErrorCode MatILUFactorSymbolic_SeqBAIJ_inplace(Mat fact,Mat A,IS isrow,IS iscol,const MatFactorInfo *info)
59614e2b4712SSatish Balay {
59624e2b4712SSatish Balay   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ*)A->data,*b;
59634e2b4712SSatish Balay   IS             isicol;
59646849ba73SBarry Smith   PetscErrorCode ierr;
59655d0c19d7SBarry Smith   const PetscInt *r,*ic,*ai = a->i,*aj = a->j,*xi;
59665d0c19d7SBarry Smith   PetscInt       prow,n = a->mbs,*ainew,*ajnew,jmax,*fill,nz,*im,*ajfill,*flev,*xitmp;
5967a96a251dSBarry Smith   PetscInt       *dloc,idx,row,m,fm,nzf,nzi,reallocate = 0,dcount = 0;
5968d0f46423SBarry Smith   PetscInt       incrlev,nnz,i,bs = A->rmap->bs,bs2 = a->bs2,levels,diagonal_fill,dd;
596941df41f0SMatthew Knepley   PetscTruth     col_identity,row_identity,both_identity,flg;
5970329f5518SBarry Smith   PetscReal      f;
59714e2b4712SSatish Balay 
59724e2b4712SSatish Balay   PetscFunctionBegin;
59736bce7ff8SHong Zhang   ierr = MatMissingDiagonal_SeqBAIJ(A,&flg,&dd);CHKERRQ(ierr);
5974e32f2f54SBarry Smith   if (flg) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Matrix A is missing diagonal entry in row %D",dd);
59756bce7ff8SHong Zhang 
5976435faa5fSBarry Smith   f             = info->fill;
5977690b6cddSBarry Smith   levels        = (PetscInt)info->levels;
5978690b6cddSBarry Smith   diagonal_fill = (PetscInt)info->diagonal_fill;
59794c49b128SBarry Smith   ierr = ISInvertPermutation(iscol,PETSC_DECIDE,&isicol);CHKERRQ(ierr);
598016a2bf60SHong Zhang 
5981667159a5SBarry Smith   ierr = ISIdentity(isrow,&row_identity);CHKERRQ(ierr);
5982667159a5SBarry Smith   ierr = ISIdentity(iscol,&col_identity);CHKERRQ(ierr);
59837d18ce8fSMatthew Knepley   both_identity = (PetscTruth) (row_identity && col_identity);
5984309c388cSBarry Smith 
598541df41f0SMatthew Knepley   if (!levels && both_identity) {  /* special case copy the nonzero structure */
598616a2bf60SHong Zhang     ierr = MatDuplicateNoCreate_SeqBAIJ(fact,A,MAT_DO_NOT_COPY_VALUES,PETSC_TRUE);CHKERRQ(ierr);
59878b1456e3SHong Zhang     ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
59886bce7ff8SHong Zhang 
5989d5f3da31SBarry Smith     fact->factortype = MAT_FACTOR_ILU;
5990ae3d28f0SHong Zhang     b            = (Mat_SeqBAIJ*)fact->data;
5991bb3d539aSBarry Smith     b->row       = isrow;
5992bb3d539aSBarry Smith     b->col       = iscol;
5993bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
5994bb3d539aSBarry Smith     ierr         = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
5995bb3d539aSBarry Smith     b->icol      = isicol;
5996bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
5997b588c5a2SHong Zhang     ierr         = PetscMalloc((n+1)*bs*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
59986bce7ff8SHong Zhang     PetscFunctionReturn(0);
59996bce7ff8SHong Zhang   }
60006bce7ff8SHong Zhang 
60016bce7ff8SHong Zhang   /* general case perform the symbolic factorization */
60024e2b4712SSatish Balay     ierr = ISGetIndices(isrow,&r);CHKERRQ(ierr);
60034e2b4712SSatish Balay     ierr = ISGetIndices(isicol,&ic);CHKERRQ(ierr);
60044e2b4712SSatish Balay 
60054e2b4712SSatish Balay     /* get new row pointers */
6006690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&ainew);CHKERRQ(ierr);
60074e2b4712SSatish Balay     ainew[0] = 0;
60084e2b4712SSatish Balay     /* don't know how many column pointers are needed so estimate */
6009690b6cddSBarry Smith     jmax = (PetscInt)(f*ai[n] + 1);
6010690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajnew);CHKERRQ(ierr);
60114e2b4712SSatish Balay     /* ajfill is level of fill for each fill entry */
6012690b6cddSBarry Smith     ierr = PetscMalloc((jmax)*sizeof(PetscInt),&ajfill);CHKERRQ(ierr);
60134e2b4712SSatish Balay     /* fill is a linked list of nonzeros in active row */
6014690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&fill);CHKERRQ(ierr);
60154e2b4712SSatish Balay     /* im is level for each filled value */
6016690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&im);CHKERRQ(ierr);
60174e2b4712SSatish Balay     /* dloc is location of diagonal in factor */
6018690b6cddSBarry Smith     ierr = PetscMalloc((n+1)*sizeof(PetscInt),&dloc);CHKERRQ(ierr);
60194e2b4712SSatish Balay     dloc[0]  = 0;
60204e2b4712SSatish Balay     for (prow=0; prow<n; prow++) {
6021435faa5fSBarry Smith 
6022435faa5fSBarry Smith       /* copy prow into linked list */
60234e2b4712SSatish Balay       nzf        = nz  = ai[r[prow]+1] - ai[r[prow]];
6024e32f2f54SBarry Smith       if (!nz) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Empty row in matrix: row in original ordering %D in permuted ordering %D",r[prow],prow);
60254e2b4712SSatish Balay       xi         = aj + ai[r[prow]];
60264e2b4712SSatish Balay       fill[n]    = n;
6027435faa5fSBarry Smith       fill[prow] = -1; /* marker for diagonal entry */
60284e2b4712SSatish Balay       while (nz--) {
60294e2b4712SSatish Balay 	fm  = n;
60304e2b4712SSatish Balay 	idx = ic[*xi++];
60314e2b4712SSatish Balay 	do {
60324e2b4712SSatish Balay 	  m  = fm;
60334e2b4712SSatish Balay 	  fm = fill[m];
60344e2b4712SSatish Balay 	} while (fm < idx);
60354e2b4712SSatish Balay 	fill[m]   = idx;
60364e2b4712SSatish Balay 	fill[idx] = fm;
60374e2b4712SSatish Balay 	im[idx]   = 0;
60384e2b4712SSatish Balay       }
6039435faa5fSBarry Smith 
6040435faa5fSBarry Smith       /* make sure diagonal entry is included */
6041435faa5fSBarry Smith       if (diagonal_fill && fill[prow] == -1) {
6042435faa5fSBarry Smith 	fm = n;
6043435faa5fSBarry Smith 	while (fill[fm] < prow) fm = fill[fm];
6044435faa5fSBarry Smith 	fill[prow] = fill[fm];  /* insert diagonal into linked list */
6045435faa5fSBarry Smith 	fill[fm]   = prow;
6046435faa5fSBarry Smith 	im[prow]   = 0;
6047435faa5fSBarry Smith 	nzf++;
6048335d9088SBarry Smith 	dcount++;
6049435faa5fSBarry Smith       }
6050435faa5fSBarry Smith 
60514e2b4712SSatish Balay       nzi = 0;
60524e2b4712SSatish Balay       row = fill[n];
60534e2b4712SSatish Balay       while (row < prow) {
60544e2b4712SSatish Balay 	incrlev = im[row] + 1;
60554e2b4712SSatish Balay 	nz      = dloc[row];
6056435faa5fSBarry Smith 	xi      = ajnew  + ainew[row] + nz + 1;
60574e2b4712SSatish Balay 	flev    = ajfill + ainew[row] + nz + 1;
60584e2b4712SSatish Balay 	nnz     = ainew[row+1] - ainew[row] - nz - 1;
60594e2b4712SSatish Balay 	fm      = row;
60604e2b4712SSatish Balay 	while (nnz-- > 0) {
60614e2b4712SSatish Balay 	  idx = *xi++;
60624e2b4712SSatish Balay 	  if (*flev + incrlev > levels) {
60634e2b4712SSatish Balay 	    flev++;
60644e2b4712SSatish Balay 	    continue;
60654e2b4712SSatish Balay 	  }
60664e2b4712SSatish Balay 	  do {
60674e2b4712SSatish Balay 	    m  = fm;
60684e2b4712SSatish Balay 	    fm = fill[m];
60694e2b4712SSatish Balay 	  } while (fm < idx);
60704e2b4712SSatish Balay 	  if (fm != idx) {
60714e2b4712SSatish Balay 	    im[idx]   = *flev + incrlev;
60724e2b4712SSatish Balay 	    fill[m]   = idx;
60734e2b4712SSatish Balay 	    fill[idx] = fm;
60744e2b4712SSatish Balay 	    fm        = idx;
60754e2b4712SSatish Balay 	    nzf++;
6076ecf371e4SBarry Smith 	  } else {
60774e2b4712SSatish Balay 	    if (im[idx] > *flev + incrlev) im[idx] = *flev+incrlev;
60784e2b4712SSatish Balay 	  }
60794e2b4712SSatish Balay 	  flev++;
60804e2b4712SSatish Balay 	}
60814e2b4712SSatish Balay 	row = fill[row];
60824e2b4712SSatish Balay 	nzi++;
60834e2b4712SSatish Balay       }
60844e2b4712SSatish Balay       /* copy new filled row into permanent storage */
60854e2b4712SSatish Balay       ainew[prow+1] = ainew[prow] + nzf;
60864e2b4712SSatish Balay       if (ainew[prow+1] > jmax) {
6087ecf371e4SBarry Smith 
6088ecf371e4SBarry Smith 	/* estimate how much additional space we will need */
6089ecf371e4SBarry Smith 	/* use the strategy suggested by David Hysom <hysom@perch-t.icase.edu> */
6090ecf371e4SBarry Smith 	/* just double the memory each time */
6091690b6cddSBarry Smith 	PetscInt maxadd = jmax;
6092ecf371e4SBarry Smith 	/* maxadd = (int)(((f*ai[n]+1)*(n-prow+5))/n); */
60934e2b4712SSatish Balay 	if (maxadd < nzf) maxadd = (n-prow)*(nzf+1);
60944e2b4712SSatish Balay 	jmax += maxadd;
6095ecf371e4SBarry Smith 
6096ecf371e4SBarry Smith 	/* allocate a longer ajnew and ajfill */
60975d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
60985d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajnew,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6099606d414cSSatish Balay 	ierr = PetscFree(ajnew);CHKERRQ(ierr);
61005d0c19d7SBarry Smith 	ajnew = xitmp;
61015d0c19d7SBarry Smith 	ierr = PetscMalloc(jmax*sizeof(PetscInt),&xitmp);CHKERRQ(ierr);
61025d0c19d7SBarry Smith 	ierr = PetscMemcpy(xitmp,ajfill,ainew[prow]*sizeof(PetscInt));CHKERRQ(ierr);
6103606d414cSSatish Balay 	ierr = PetscFree(ajfill);CHKERRQ(ierr);
61045d0c19d7SBarry Smith 	ajfill = xitmp;
6105eb150c5cSKris Buschelman 	reallocate++; /* count how many reallocations are needed */
61064e2b4712SSatish Balay       }
61075d0c19d7SBarry Smith       xitmp       = ajnew + ainew[prow];
61084e2b4712SSatish Balay       flev        = ajfill + ainew[prow];
61094e2b4712SSatish Balay       dloc[prow]  = nzi;
61104e2b4712SSatish Balay       fm          = fill[n];
61114e2b4712SSatish Balay       while (nzf--) {
61125d0c19d7SBarry Smith 	*xitmp++ = fm;
61134e2b4712SSatish Balay 	*flev++ = im[fm];
61144e2b4712SSatish Balay 	fm      = fill[fm];
61154e2b4712SSatish Balay       }
6116435faa5fSBarry Smith       /* make sure row has diagonal entry */
6117435faa5fSBarry Smith       if (ajnew[ainew[prow]+dloc[prow]] != prow) {
6118e32f2f54SBarry Smith 	SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"Row %D has missing diagonal in factored matrix\n\
61192401956bSBarry Smith     try running with -pc_factor_nonzeros_along_diagonal or -pc_factor_diagonal_fill",prow);
6120435faa5fSBarry Smith       }
61214e2b4712SSatish Balay     }
6122606d414cSSatish Balay     ierr = PetscFree(ajfill);CHKERRQ(ierr);
61234e2b4712SSatish Balay     ierr = ISRestoreIndices(isrow,&r);CHKERRQ(ierr);
61244e2b4712SSatish Balay     ierr = ISRestoreIndices(isicol,&ic);CHKERRQ(ierr);
6125606d414cSSatish Balay     ierr = PetscFree(fill);CHKERRQ(ierr);
6126606d414cSSatish Balay     ierr = PetscFree(im);CHKERRQ(ierr);
61274e2b4712SSatish Balay 
61286cf91177SBarry Smith #if defined(PETSC_USE_INFO)
61294e2b4712SSatish Balay     {
6130329f5518SBarry Smith       PetscReal af = ((PetscReal)ainew[n])/((PetscReal)ai[n]);
6131ae15b995SBarry Smith       ierr = PetscInfo3(A,"Reallocs %D Fill ratio:given %G needed %G\n",reallocate,f,af);CHKERRQ(ierr);
6132ae15b995SBarry Smith       ierr = PetscInfo1(A,"Run with -pc_factor_fill %G or use \n",af);CHKERRQ(ierr);
6133ae15b995SBarry Smith       ierr = PetscInfo1(A,"PCFactorSetFill(pc,%G);\n",af);CHKERRQ(ierr);
6134ae15b995SBarry Smith       ierr = PetscInfo(A,"for best performance.\n");CHKERRQ(ierr);
6135335d9088SBarry Smith       if (diagonal_fill) {
6136ae15b995SBarry Smith 	ierr = PetscInfo1(A,"Detected and replaced %D missing diagonals\n",dcount);CHKERRQ(ierr);
6137335d9088SBarry Smith       }
61384e2b4712SSatish Balay     }
613963ba0a88SBarry Smith #endif
61404e2b4712SSatish Balay 
61414e2b4712SSatish Balay     /* put together the new matrix */
6142719d5645SBarry Smith     ierr = MatSeqBAIJSetPreallocation_SeqBAIJ(fact,bs,MAT_SKIP_ALLOCATION,PETSC_NULL);CHKERRQ(ierr);
6143719d5645SBarry Smith     ierr = PetscLogObjectParent(fact,isicol);CHKERRQ(ierr);
6144ae3d28f0SHong Zhang     b    = (Mat_SeqBAIJ*)fact->data;
6145e6b907acSBarry Smith     b->free_a       = PETSC_TRUE;
6146e6b907acSBarry Smith     b->free_ij      = PETSC_TRUE;
61477c922b88SBarry Smith     b->singlemalloc = PETSC_FALSE;
6148a96a251dSBarry Smith     ierr = PetscMalloc(bs2*ainew[n]*sizeof(MatScalar),&b->a);CHKERRQ(ierr);
61494e2b4712SSatish Balay     b->j          = ajnew;
61504e2b4712SSatish Balay     b->i          = ainew;
61514e2b4712SSatish Balay     for (i=0; i<n; i++) dloc[i] += ainew[i];
61524e2b4712SSatish Balay     b->diag       = dloc;
61537f53bb6cSHong Zhang     b->free_diag  = PETSC_TRUE;
61544e2b4712SSatish Balay     b->ilen       = 0;
61554e2b4712SSatish Balay     b->imax       = 0;
61564e2b4712SSatish Balay     b->row        = isrow;
61574e2b4712SSatish Balay     b->col        = iscol;
6158bcd9e38bSBarry Smith     b->pivotinblocks = (info->pivotinblocks) ? PETSC_TRUE : PETSC_FALSE;
6159c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)isrow);CHKERRQ(ierr);
6160c38d4ed2SBarry Smith     ierr          = PetscObjectReference((PetscObject)iscol);CHKERRQ(ierr);
6161e51c0b9cSSatish Balay     b->icol       = isicol;
616287828ca2SBarry Smith     ierr = PetscMalloc((bs*n+bs)*sizeof(PetscScalar),&b->solve_work);CHKERRQ(ierr);
61634e2b4712SSatish Balay     /* In b structure:  Free imax, ilen, old a, old j.
61644e2b4712SSatish Balay        Allocate dloc, solve_work, new a, new j */
6165719d5645SBarry Smith     ierr = PetscLogObjectMemory(fact,(ainew[n]-n)*(sizeof(PetscInt))+bs2*ainew[n]*sizeof(PetscScalar));CHKERRQ(ierr);
61664e2b4712SSatish Balay     b->maxnz          = b->nz = ainew[n];
61674e2b4712SSatish Balay 
6168ae3d28f0SHong Zhang     fact->info.factor_mallocs    = reallocate;
6169ae3d28f0SHong Zhang     fact->info.fill_ratio_given  = f;
6170ae3d28f0SHong Zhang     fact->info.fill_ratio_needed = ((PetscReal)ainew[n])/((PetscReal)ai[prow]);
61716bce7ff8SHong Zhang 
61728b1456e3SHong Zhang   ierr = MatSeqBAIJSetNumericFactorization_inplace(fact,both_identity);CHKERRQ(ierr);
61738661488fSKris Buschelman   PetscFunctionReturn(0);
61748661488fSKris Buschelman }
61758661488fSKris Buschelman 
6176732ee342SKris Buschelman #undef __FUNCT__
61777e7071cdSKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE"
6178dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE(Mat A)
61797e7071cdSKris Buschelman {
618012272027SHong Zhang   /* Mat_SeqBAIJ *a = (Mat_SeqBAIJ *)A->data; */
618112272027SHong Zhang   /* int i,*AJ=a->j,nz=a->nz; */
61825a9542e3SKris Buschelman   PetscFunctionBegin;
61837cf1b8d3SKris Buschelman   /* Undo Column scaling */
61847cf1b8d3SKris Buschelman /*    while (nz--) { */
61857cf1b8d3SKris Buschelman /*      AJ[i] = AJ[i]/4; */
61867cf1b8d3SKris Buschelman /*    } */
6187c115a38dSKris Buschelman   /* This should really invoke a push/pop logic, but we don't have that yet. */
6188c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
61897cf1b8d3SKris Buschelman   PetscFunctionReturn(0);
61907cf1b8d3SKris Buschelman }
61917cf1b8d3SKris Buschelman 
61927cf1b8d3SKris Buschelman #undef __FUNCT__
61937cf1b8d3SKris Buschelman #define __FUNCT__ "MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj"
6194dfbe8321SBarry Smith PetscErrorCode MatSetUnfactored_SeqBAIJ_4_NaturalOrdering_SSE_usj(Mat A)
61957cf1b8d3SKris Buschelman {
61967cf1b8d3SKris Buschelman   Mat_SeqBAIJ    *a = (Mat_SeqBAIJ *)A->data;
6197b24ad042SBarry Smith   PetscInt       *AJ=a->j,nz=a->nz;
61982aa5897fSKris Buschelman   unsigned short *aj=(unsigned short *)AJ;
61995a9542e3SKris Buschelman   PetscFunctionBegin;
62000b9da03eSKris Buschelman   /* Is this really necessary? */
620120235379SKris Buschelman   while (nz--) {
62020b9da03eSKris Buschelman     AJ[nz] = (int)((unsigned int)aj[nz]); /* First extend, then convert to signed. */
62037e7071cdSKris Buschelman   }
6204c115a38dSKris Buschelman   A->ops->setunfactored = PETSC_NULL;
62057e7071cdSKris Buschelman   PetscFunctionReturn(0);
62067e7071cdSKris Buschelman }
62077e7071cdSKris Buschelman 
6208732ee342SKris Buschelman 
6209